diff options
49 files changed, 5638 insertions, 39 deletions
@@ -181,6 +181,7 @@ ^tools/misc/xc_shadow$ ^tools/misc/xen_cpuperf$ ^tools/misc/xen-detect$ +^tools/misc/xen-tmem-list-parse$ ^tools/misc/xenperf$ ^tools/misc/xenpm$ ^tools/pygrub/build/.*$ diff --git a/tools/libxc/Makefile b/tools/libxc/Makefile index acd7067e57..e984df9088 100644 --- a/tools/libxc/Makefile +++ b/tools/libxc/Makefile @@ -21,6 +21,7 @@ CTRL_SRCS-y += xc_tbuf.c CTRL_SRCS-y += xc_pm.c CTRL_SRCS-y += xc_cpu_hotplug.c CTRL_SRCS-y += xc_resume.c +CTRL_SRCS-y += xc_tmem.c CTRL_SRCS-$(CONFIG_X86) += xc_pagetab.c CTRL_SRCS-$(CONFIG_Linux) += xc_linux.c CTRL_SRCS-$(CONFIG_SunOS) += xc_solaris.c diff --git a/tools/libxc/xc_tmem.c b/tools/libxc/xc_tmem.c new file mode 100644 index 0000000000..ba618ef98c --- /dev/null +++ b/tools/libxc/xc_tmem.c @@ -0,0 +1,83 @@ +/****************************************************************************** + * xc_tmem.c + * + * Copyright (C) 2008 Oracle Corp. + */ + +#include "xc_private.h" +#include <xen/tmem.h> + +static int do_tmem_op(int xc, tmem_op_t *op) +{ + int ret; + DECLARE_HYPERCALL; + + hypercall.op = __HYPERVISOR_tmem_op; + hypercall.arg[0] = (unsigned long)op; + if (lock_pages(op, sizeof(*op)) != 0) + { + PERROR("Could not lock memory for Xen hypercall"); + return -EFAULT; + } + if ((ret = do_xen_hypercall(xc, &hypercall)) < 0) + { + if ( errno == EACCES ) + DPRINTF("tmem operation failed -- need to" + " rebuild the user-space tool set?\n"); + } + unlock_pages(op, sizeof(*op)); + + return ret; +} + +int xc_tmem_control(int xc, + int32_t pool_id, + uint32_t subop, + uint32_t cli_id, + uint32_t arg1, + uint32_t arg2, + void *buf) +{ + tmem_op_t op; + int rc; + + op.cmd = TMEM_CONTROL; + op.pool_id = pool_id; + op.subop = subop; + op.cli_id = cli_id; + op.arg1 = arg1; + op.arg2 = arg2; + op.buf.p = buf; + + if (subop == TMEMC_LIST) { + if ((arg1 != 0) && (lock_pages(buf, arg1) != 0)) + { + PERROR("Could not lock memory for Xen hypercall"); + return -ENOMEM; + } + } + +#ifdef VALGRIND + if (arg1 != 0) + memset(buf, 
0, arg1); +#endif + + rc = do_tmem_op(xc, &op); + + if (subop == TMEMC_LIST) { + if (arg1 != 0) + unlock_pages(buf, arg1); + } + + return rc; +} + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff --git a/tools/libxc/xenctrl.h b/tools/libxc/xenctrl.h index c9b1866b60..8a54d5775d 100644 --- a/tools/libxc/xenctrl.h +++ b/tools/libxc/xenctrl.h @@ -1267,4 +1267,15 @@ int xc_get_vcpu_migration_delay(int xc_handle, uint32_t *value); int xc_get_cpuidle_max_cstate(int xc_handle, uint32_t *value); int xc_set_cpuidle_max_cstate(int xc_handle, uint32_t value); +/** + * tmem operations + */ +int xc_tmem_control(int xc, + int32_t pool_id, + uint32_t subop, + uint32_t cli_id, + uint32_t arg1, + uint32_t arg2, + void *buf); + #endif /* XENCTRL_H */ diff --git a/tools/misc/Makefile b/tools/misc/Makefile index c309a3f106..b6a735bfdd 100644 --- a/tools/misc/Makefile +++ b/tools/misc/Makefile @@ -10,7 +10,7 @@ CFLAGS += $(INCLUDES) HDRS = $(wildcard *.h) -TARGETS-y := xenperf xenpm +TARGETS-y := xenperf xenpm xen-tmem-list-parse TARGETS-$(CONFIG_X86) += xen-detect TARGETS := $(TARGETS-y) @@ -22,7 +22,7 @@ INSTALL_BIN-y := xencons INSTALL_BIN-$(CONFIG_X86) += xen-detect INSTALL_BIN := $(INSTALL_BIN-y) -INSTALL_SBIN-y := xm xen-bugtool xen-python-path xend xenperf xsview xenpm +INSTALL_SBIN-y := xm xen-bugtool xen-python-path xend xenperf xsview xenpm xen-tmem-list-parse INSTALL_SBIN := $(INSTALL_SBIN-y) DEFAULT_PYTHON_PATH := $(shell $(XEN_ROOT)/tools/python/get-path) diff --git a/tools/misc/xen-tmem-list-parse.c b/tools/misc/xen-tmem-list-parse.c new file mode 100644 index 0000000000..383daee158 --- /dev/null +++ b/tools/misc/xen-tmem-list-parse.c @@ -0,0 +1,288 @@ +/* + * Parse output from tmem-list and reformat to human-readable + * + * NOTE: NEVER delete a parse call as this file documents backwards + * compatibility for older versions of tmem-list and we don't want to + * accidentally 
reuse an old tag + * + * Copyright (c) 2009, Dan Magenheimer, Oracle Corp. + */ + +#include <stdio.h> +#include <unistd.h> +#include <string.h> + +#define BUFSIZE 4096 +#define PAGE_SIZE 4096 + +unsigned long long parse(char *s,char *match) +{ + char *s1 = strstr(s,match); + unsigned long long ret; + + if ( s1 == NULL ) + return 0LL; + s1 += 2; + if ( *s1++ != ':' ) + return 0LL; + sscanf(s1,"%llu",&ret); + return ret; +} + +unsigned long long parse2(char *s,char *match1, char *match2) +{ + char match[3]; + match[0] = *match1; + match[1] = *match2; + match[2] = '\0'; + return parse(s,match); +} + +void parse_string(char *s,char *match, char *buf, int len) +{ + char *s1 = strstr(s,match); + int i; + + if ( s1 == NULL ) + return; + s1 += 2; + if ( *s1++ != ':' ) + return; + for ( i = 0; i < len; i++ ) + *buf++ = *s1++; +} + +void parse_sharers(char *s, char *match, char *buf, int len) +{ + char *s1 = strstr(s,match); + char *b = buf; + + if ( s1 == NULL ) + return; + while ( s1 ) + { + s1 += 2; + if (*s1++ != ':') + return; + while (*s1 <= '0' && *s1 <= '9') + *b++ = *s1++; + *b++ = ','; + s1 = strstr(s1,match); + } + if ( b != buf ) + *--b = '\0'; +} + +void parse_global(char *s) +{ + unsigned long long total_ops = parse(s,"Tt"); + unsigned long long errored_ops = parse(s,"Te"); + unsigned long long failed_copies = parse(s,"Cf"); + unsigned long long alloc_failed = parse(s,"Af"); + unsigned long long alloc_page_failed = parse(s,"Pf"); + unsigned long long avail_pages = parse(s,"Ta"); + unsigned long long low_on_memory = parse(s,"Lm"); + unsigned long long evicted_pgs = parse(s,"Et"); + unsigned long long evict_attempts = parse(s,"Ea"); + unsigned long long relinq_pgs = parse(s,"Rt"); + unsigned long long relinq_attempts = parse(s,"Ra"); + unsigned long long max_evicts_per_relinq = parse(s,"Rx"); + unsigned long long total_flush_pool = parse(s,"Fp"); + unsigned long long global_eph_count = parse(s,"Ec"); + unsigned long long global_eph_max = parse(s,"Em"); + unsigned 
long long obj_count = parse(s,"Oc"); + unsigned long long obj_max = parse(s,"Om"); + unsigned long long rtree_node_count = parse(s,"Nc"); + unsigned long long rtree_node_max = parse(s,"Nm"); + unsigned long long pgp_count = parse(s,"Pc"); + unsigned long long pgp_max = parse(s,"Pm"); + + printf("total tmem ops=%llu (errors=%llu) -- tmem pages avail=%llu\n", + total_ops, errored_ops, avail_pages); + printf("datastructs: objs=%llu (max=%llu) pgps=%llu (max=%llu) " + "nodes=%llu (max=%llu)\n", + obj_count, obj_max, pgp_count, pgp_max, + rtree_node_count, rtree_node_max); + printf("misc: failed_copies=%llu alloc_failed=%llu alloc_page_failed=%llu " + "low_mem=%llu evicted=%llu/%llu relinq=%llu/%llu, " + "max_evicts_per_relinq=%llu, flush_pools=%llu, " + "eph_count=%llu, eph_max=%llu\n", + failed_copies, alloc_failed, alloc_page_failed, low_on_memory, + evicted_pgs, evict_attempts, relinq_pgs, relinq_attempts, + max_evicts_per_relinq, total_flush_pool, + global_eph_count, global_eph_max); +} + +#define PARSE_CYC_COUNTER(s,x,prefix) unsigned long long \ + x##_count = parse2(s,prefix,"n"), \ + x##_sum_cycles = parse2(s,prefix,"t"), \ + x##_max_cycles = parse2(s,prefix,"x"), \ + x##_min_cycles = parse2(s,prefix,"m") +#define PRINTF_CYC_COUNTER(x,text) \ + if (x##_count) printf(text" avg=%llu, max=%llu, " \ + "min=%llu, samples=%llu\n", \ + x##_sum_cycles ? 
(x##_sum_cycles/x##_count) : 0, \ + x##_max_cycles, x##_min_cycles, x##_count) + +void parse_time_stats(char *s) +{ + PARSE_CYC_COUNTER(s,succ_get,"G"); + PARSE_CYC_COUNTER(s,succ_put,"P"); + PARSE_CYC_COUNTER(s,non_succ_get,"g"); + PARSE_CYC_COUNTER(s,non_succ_put,"p"); + PARSE_CYC_COUNTER(s,flush,"F"); + PARSE_CYC_COUNTER(s,flush_obj,"O"); + PARSE_CYC_COUNTER(s,pg_copy,"C"); + PARSE_CYC_COUNTER(s,compress,"c"); + PARSE_CYC_COUNTER(s,decompress,"d"); + + PRINTF_CYC_COUNTER(succ_get,"succ get cycles:"); + PRINTF_CYC_COUNTER(succ_put,"succ put cycles:"); + PRINTF_CYC_COUNTER(non_succ_get,"failed get cycles:"); + PRINTF_CYC_COUNTER(non_succ_put,"failed put cycles:"); + PRINTF_CYC_COUNTER(flush,"flush cycles:"); + PRINTF_CYC_COUNTER(flush_obj,"flush_obj cycles:"); + PRINTF_CYC_COUNTER(pg_copy,"page copy cycles:"); + PRINTF_CYC_COUNTER(compress,"compression cycles:"); + PRINTF_CYC_COUNTER(decompress,"decompression cycles:"); +} + +void parse_client(char *s) +{ + unsigned long cli_id = parse(s,"CI"); + unsigned long weight = parse(s,"ww"); + unsigned long cap = parse(s,"ca"); + unsigned long compress = parse(s,"co"); + unsigned long frozen = parse(s,"fr"); + unsigned long long eph_count = parse(s,"Ec"); + unsigned long long max_eph_count = parse(s,"Em"); + unsigned long long compressed_pages = parse(s,"cp"); + unsigned long long compressed_sum_size = parse(s,"cb"); + unsigned long long compress_poor = parse(s,"cn"); + unsigned long long compress_nomem = parse(s,"cm"); + + printf("domid%lu: weight=%lu,cap=%lu,compress=%d,frozen=%d," + "eph_count=%llu,max_eph=%llu," + "compression ratio=%lu%% (samples=%llu,poor=%llu,nomem=%llu)\n", + cli_id, weight, cap, compress?1:0, frozen?1:0, + eph_count, max_eph_count, + compressed_pages ? 
(long)((compressed_sum_size*100LL) / + (compressed_pages*PAGE_SIZE)) : 0, + compressed_pages, compress_poor, compress_nomem); + +} + +void parse_pool(char *s) +{ + char pool_type[3]; + unsigned long cli_id = parse(s,"CI"); + unsigned long pool_id = parse(s,"PI"); + unsigned long long pgp_count = parse(s,"Pc"); + unsigned long long max_pgp_count = parse(s,"Pm"); + unsigned long long obj_count = parse(s,"Oc"); + unsigned long long max_obj_count = parse(s,"Om"); + unsigned long long objnode_count = parse(s,"Nc"); + unsigned long long max_objnode_count = parse(s,"Nm"); + unsigned long long good_puts = parse(s,"ps"); + unsigned long long puts = parse(s,"pt"); + unsigned long long no_mem_puts = parse(s,"px"); + unsigned long long dup_puts_flushed = parse(s,"pd"); + unsigned long long dup_puts_replaced = parse(s,"pr"); + unsigned long long found_gets = parse(s,"gs"); + unsigned long long gets = parse(s,"gt"); + unsigned long long flushs_found = parse(s,"fs"); + unsigned long long flushs = parse(s,"ft"); + unsigned long long flush_objs_found = parse(s,"os"); + unsigned long long flush_objs = parse(s,"ot"); + + parse_string(s,"PT",pool_type,2); + printf("domid%lu,id%lu[%s]:pgp=%llu(max=%llu) obj=%llu(%llu) " + "objnode=%llu(%llu) puts=%llu/%llu/%llu(dup=%llu/%llu) " + "gets=%llu/%llu(%llu%%) " + "flush=%llu/%llu flobj=%llu/%llu\n", + cli_id, pool_id, pool_type, + pgp_count, max_pgp_count, obj_count, max_obj_count, + objnode_count, max_objnode_count, + good_puts, puts, no_mem_puts, + dup_puts_flushed, dup_puts_replaced, + found_gets, gets, + gets ? 
(found_gets*100LL)/gets : 0, + flushs_found, flushs, flush_objs_found, flush_objs); + +} + +void parse_shared_pool(char *s) +{ + char pool_type[3]; + char buf[BUFSIZE]; + unsigned long pool_id = parse(s,"PI"); + unsigned long long uid0 = parse(s,"U0"); + unsigned long long uid1 = parse(s,"U1"); + unsigned long long pgp_count = parse(s,"Pc"); + unsigned long long max_pgp_count = parse(s,"Pm"); + unsigned long long obj_count = parse(s,"Oc"); + unsigned long long max_obj_count = parse(s,"Om"); + unsigned long long objnode_count = parse(s,"Nc"); + unsigned long long max_objnode_count = parse(s,"Nm"); + unsigned long long good_puts = parse(s,"ps"); + unsigned long long puts = parse(s,"pt"); + unsigned long long no_mem_puts = parse(s,"px"); + unsigned long long dup_puts_flushed = parse(s,"pd"); + unsigned long long dup_puts_replaced = parse(s,"pr"); + unsigned long long found_gets = parse(s,"gs"); + unsigned long long gets = parse(s,"gt"); + unsigned long long flushs_found = parse(s,"fs"); + unsigned long long flushs = parse(s,"ft"); + unsigned long long flush_objs_found = parse(s,"os"); + unsigned long long flush_objs = parse(s,"ot"); + + parse_string(s,"PT",pool_type,2); + parse_sharers(s,"SC",buf,BUFSIZE); + printf("poolid=%lu[%s] uuid=%llu.%llu, shared-by:%s: " + "pgp=%llu(max=%llu) obj=%llu(%llu) " + "objnode=%llu(%llu) puts=%llu/%llu/%llu(dup=%llu/%llu) " + "gets=%llu/%llu(%llu%%) " + "flush=%llu/%llu flobj=%llu/%llu\n", + pool_id, pool_type, uid0, uid1, buf, + pgp_count, max_pgp_count, obj_count, max_obj_count, + objnode_count, max_objnode_count, + good_puts, puts, no_mem_puts, + dup_puts_flushed, dup_puts_replaced, + found_gets, gets, + gets ? 
(found_gets*100LL)/gets : 0, + flushs_found, flushs, flush_objs_found, flush_objs); +} + +int main(int ac, char **av) +{ + char *p, c; + char buf[BUFSIZE]; + + while ( (p = fgets(buf,BUFSIZE,stdin)) != NULL ) + { + c = *p++; + if ( *p++ != '=' ) + continue; + switch ( c ) + { + case 'G': + parse_global(p); + break; + case 'T': + parse_time_stats(p); + break; + case 'C': + parse_client(p); + break; + case 'P': + parse_pool(p); + break; + case 'S': + parse_shared_pool(p); + break; + default: + continue; + } + } + return 0; +} diff --git a/tools/python/xen/lowlevel/xc/xc.c b/tools/python/xen/lowlevel/xc/xc.c index 75a19d3d6b..5a0bf1807c 100644 --- a/tools/python/xen/lowlevel/xc/xc.c +++ b/tools/python/xen/lowlevel/xc/xc.c @@ -19,6 +19,7 @@ #include "xenctrl.h" #include <xen/elfnote.h> +#include <xen/tmem.h> #include "xc_dom.h" #include <xen/hvm/hvm_info_table.h> #include <xen/hvm/params.h> @@ -1506,6 +1507,50 @@ static PyObject *dom_op(XcObject *self, PyObject *args, return zero; } +static PyObject *pyxc_tmem_control(XcObject *self, + PyObject *args, + PyObject *kwds) +{ + int32_t pool_id; + uint32_t subop; + uint32_t cli_id; + uint32_t arg1; + uint32_t arg2; + char *buf; + char _buffer[32768], *buffer = _buffer; + int rc; + + static char *kwd_list[] = { "pool_id", "subop", "cli_id", "arg1", "arg2", "buf", NULL }; + + if ( !PyArg_ParseTupleAndKeywords(args, kwds, "iiiiis", kwd_list, + &pool_id, &subop, &cli_id, &arg1, &arg2, &buf) ) + return NULL; + + if ( (subop == TMEMC_LIST) && (arg1 > 32768) ) + arg1 = 32768; + + if ( (rc = xc_tmem_control(self->xc_handle, pool_id, subop, cli_id, arg1, arg2, buffer)) < 0 ) + return Py_BuildValue("i", rc); + + switch (subop) { + case TMEMC_LIST: + return Py_BuildValue("s", buffer); + case TMEMC_FLUSH: + return Py_BuildValue("i", rc); + case TMEMC_THAW: + case TMEMC_FREEZE: + case TMEMC_DESTROY: + case TMEMC_SET_WEIGHT: + case TMEMC_SET_CAP: + case TMEMC_SET_COMPRESS: + default: + break; + } + + Py_INCREF(zero); + return zero; +} + 
static PyMethodDef pyxc_methods[] = { { "handle", (PyCFunction)pyxc_handle, @@ -1965,6 +2010,18 @@ static PyMethodDef pyxc_methods[] = { " dom [int]: Identifier of domain.\n" }, #endif + { "tmem_control", + (PyCFunction)pyxc_tmem_control, + METH_VARARGS | METH_KEYWORDS, "\n" + "Do various control on a tmem pool.\n" + " pool_id [int]: Identifier of the tmem pool (-1 == all).\n" + " subop [int]: Supplementary Operation.\n" + " cli_id [int]: Client identifier (-1 == all).\n" + " arg1 [int]: Argument.\n" + " arg2 [int]: Argument.\n" + " buf [str]: Buffer.\n\n" + "Returns: [int] 0 or [str] tmem info on success; exception on error.\n" }, + { NULL, NULL, 0, NULL } }; diff --git a/tools/python/xen/xend/XendAPI.py b/tools/python/xen/xend/XendAPI.py index 6dda3a9c79..126db6421c 100644 --- a/tools/python/xen/xend/XendAPI.py +++ b/tools/python/xen/xend/XendAPI.py @@ -925,7 +925,15 @@ class XendAPI(object): ('dmesg', 'String'), ('dmesg_clear', 'String'), ('get_log', 'String'), - ('send_debug_keys', None)] + ('send_debug_keys', None), + ('tmem_thaw', None), + ('tmem_freeze', None), + ('tmem_flush', None), + ('tmem_destroy', None), + ('tmem_list', None), + ('tmem_set_weight', None), + ('tmem_set_cap', None), + ('tmem_set_compress', None)] host_funcs = [('get_by_name_label', None), ('list_methods', None)] @@ -1061,6 +1069,70 @@ class XendAPI(object): 'PSCSIs': XendPSCSI.get_all()} return xen_api_success(record) + def host_tmem_thaw(self, _, host_ref, cli_id): + node = XendNode.instance() + try: + node.tmem_thaw(cli_id) + except Exception, e: + return xen_api_error(e) + return xen_api_success_void() + + def host_tmem_freeze(self, _, host_ref, cli_id): + node = XendNode.instance() + try: + node.tmem_freeze(cli_id) + except Exception, e: + return xen_api_error(e) + return xen_api_success_void() + + def host_tmem_flush(self, _, host_ref, cli_id, pages): + node = XendNode.instance() + try: + node.tmem_flush(cli_id, pages) + except Exception, e: + return xen_api_error(e) + return 
xen_api_success_void() + + def host_tmem_destroy(self, _, host_ref, cli_id): + node = XendNode.instance() + try: + node.tmem_destroy(cli_id) + except Exception, e: + return xen_api_error(e) + return xen_api_success_void() + + def host_tmem_list(self, _, host_ref, cli_id, use_long): + node = XendNode.instance() + try: + info = node.tmem_list(cli_id, use_long) + except Exception, e: + return xen_api_error(e) + return xen_api_success(info) + + def host_tmem_set_weight(self, _, host_ref, cli_id, value): + node = XendNode.instance() + try: + node.tmem_set_weight(cli_id, value) + except Exception, e: + return xen_api_error(e) + return xen_api_success_void() + + def host_tmem_set_cap(self, _, host_ref, cli_id, value): + node = XendNode.instance() + try: + node.tmem_set_cap(cli_id, value) + except Exception, e: + return xen_api_error(e) + return xen_api_success_void() + + def host_tmem_set_compress(self, _, host_ref, cli_id, value): + node = XendNode.instance() + try: + node.tmem_set_compress(cli_id, value) + except Exception, e: + return xen_api_error(e) + return xen_api_success_void() + # class methods def host_get_all(self, session): return xen_api_success((XendNode.instance().uuid,)) diff --git a/tools/python/xen/xend/XendConstants.py b/tools/python/xen/xend/XendConstants.py index b0a7c66abb..c25ba2935a 100644 --- a/tools/python/xen/xend/XendConstants.py +++ b/tools/python/xen/xend/XendConstants.py @@ -141,3 +141,29 @@ XS_VMROOT = "/vm/" NR_PCI_DEV = 32 AUTO_PHP_SLOT = NR_PCI_DEV AUTO_PHP_SLOT_STR = "%02x" % NR_PCI_DEV + +# +# tmem +# + +TMEM_CONTROL = 0 +TMEM_NEW_POOL = 1 +TMEM_DESTROY_POOL = 2 +TMEM_NEW_PAGE = 3 +TMEM_PUT_PAGE = 4 +TMEM_GET_PAGE = 5 +TMEM_FLUSH_PAGE = 6 +TMEM_FLUSH_OBJECT = 7 +TMEM_READ = 8 +TMEM_WRITE = 9 +TMEM_XCHG = 10 + +TMEMC_THAW = 0 +TMEMC_FREEZE = 1 +TMEMC_FLUSH = 2 +TMEMC_DESTROY = 3 +TMEMC_LIST = 4 +TMEMC_SET_WEIGHT = 5 +TMEMC_SET_CAP = 6 +TMEMC_SET_COMPRESS = 7 + diff --git a/tools/python/xen/xend/XendNode.py 
b/tools/python/xen/xend/XendNode.py index d1c4055ba4..34682b90aa 100644 --- a/tools/python/xen/xend/XendNode.py +++ b/tools/python/xen/xend/XendNode.py @@ -26,6 +26,7 @@ from xen.util import pci as PciUtil from xen.util import vscsi_util from xen.xend import XendAPIStore from xen.xend import osdep +from xen.xend.XendConstants import * import uuid, arch from XendPBD import XendPBD @@ -940,6 +941,69 @@ class XendNode: def info_dict(self): return dict(self.info()) + # tmem + def tmem_list(self, cli_id, use_long): + pool_id = -1 + subop = TMEMC_LIST + arg1 = 32768 + arg2 = use_long + buf = '' + return self.xc.tmem_control(pool_id, subop, cli_id, arg1, arg2, buf) + + def tmem_thaw(self, cli_id): + pool_id = -1 + subop = TMEMC_THAW + arg1 = 0 + arg2 = 0 + buf = '' + return self.xc.tmem_control(pool_id, subop, cli_id, arg1, arg2, buf) + + def tmem_freeze(self, cli_id): + pool_id = -1 + subop = TMEMC_FREEZE + arg1 = 0 + arg2 = 0 + buf = '' + return self.xc.tmem_control(pool_id, subop, cli_id, arg1, arg2, buf) + + def tmem_flush(self, cli_id, pages): + pool_id = -1 + subop = TMEMC_FLUSH + arg1 = pages + arg2 = 0 + buf = '' + return self.xc.tmem_control(pool_id, subop, cli_id, arg1, arg2, buf) + + def tmem_destroy(self, cli_id): + pool_id = -1 + subop = TMEMC_DESTROY + arg1 = 0 + arg2 = 0 + buf = '' + return self.xc.tmem_control(pool_id, subop, cli_id, arg1, arg2, buf) + + def tmem_set_weight(self, cli_id, arg1): + pool_id = -1 + subop = TMEMC_SET_WEIGHT + arg2 = 0 + buf = '' + return self.xc.tmem_control(pool_id, subop, cli_id, arg1, arg2, buf) + + def tmem_set_cap(self, cli_id, arg1): + pool_id = -1 + subop = TMEMC_SET_CAP + arg2 = 0 + buf = '' + return self.xc.tmem_control(pool_id, subop, cli_id, arg1, arg2, buf) + + def tmem_set_compress(self, cli_id, arg1): + pool_id = -1 + subop = TMEMC_SET_COMPRESS + arg2 = 0 + buf = '' + return self.xc.tmem_control(pool_id, subop, cli_id, arg1, arg2, buf) + + def instance(): global inst try: diff --git 
a/tools/python/xen/xend/balloon.py b/tools/python/xen/xend/balloon.py index b31398c745..42c8ea0aa7 100644 --- a/tools/python/xen/xend/balloon.py +++ b/tools/python/xen/xend/balloon.py @@ -26,6 +26,7 @@ import XendOptions from XendLogging import log from XendError import VmError import osdep +from xen.xend.XendConstants import * RETRY_LIMIT = 20 RETRY_LIMIT_INCR = 5 @@ -109,6 +110,9 @@ def free(need_mem, dominfo): last_free = None rlimit = RETRY_LIMIT + # stop tmem from absorbing any more memory (must THAW when done!) + xc.tmem_control(0,TMEMC_FREEZE,-1, 0, 0, "") + # If unreasonable memory size is required, we give up waiting # for ballooning or scrubbing, as if had retried. physinfo = xc.physinfo() @@ -122,6 +126,17 @@ def free(need_mem, dominfo): if need_mem >= max_free_mem: retries = rlimit + freeable_mem = free_mem + scrub_mem + if freeable_mem < need_mem and need_mem < max_free_mem: + # flush memory from tmem to scrub_mem and reobtain physinfo + need_tmem_kb = need_mem - freeable_mem + tmem_kb = xc.tmem_control(0,TMEMC_FLUSH,-1, need_tmem_kb, 0, "") + log.debug("Balloon: tmem relinquished %d KiB of %d KiB requested.", + tmem_kb, need_tmem_kb) + physinfo = xc.physinfo() + free_mem = physinfo['free_memory'] + scrub_mem = physinfo['scrub_memory'] + # Check whethercurrent machine is a numa system and the new # created hvm has all its vcpus in the same node, if all the # conditions above are fit. 
We will wait until all the pages @@ -216,4 +231,6 @@ def free(need_mem, dominfo): ' be shrunk any further')) finally: + # allow tmem to accept pages again + xc.tmem_control(0,TMEMC_THAW,-1, 0, 0, "") del xc diff --git a/tools/python/xen/xend/server/XMLRPCServer.py b/tools/python/xen/xend/server/XMLRPCServer.py index fb9bdfee34..93c6caef1b 100644 --- a/tools/python/xen/xend/server/XMLRPCServer.py +++ b/tools/python/xen/xend/server/XMLRPCServer.py @@ -198,7 +198,11 @@ class XMLRPCServer: self.server.register_function(fn, "xend.domain.%s" % name[7:]) # Functions in XendNode and XendDmesg - for type, lst, n in [(XendNode, ['info', 'pciinfo', 'send_debug_keys'], + for type, lst, n in [(XendNode, + ['info', 'pciinfo', 'send_debug_keys', + 'tmem_list', 'tmem_freeze', 'tmem_thaw', + 'tmem_flush', 'tmem_destroy', 'tmem_set_weight', + 'tmem_set_cap', 'tmem_set_compress'], 'node'), (XendDmesg, ['info', 'clear'], 'node.dmesg')]: inst = type.instance() diff --git a/tools/python/xen/xm/main.py b/tools/python/xen/xm/main.py index b7897b248a..e55fab42e5 100644 --- a/tools/python/xen/xm/main.py +++ b/tools/python/xen/xm/main.py @@ -199,6 +199,15 @@ SUBCOMMAND_HELP = { 'scsi-list' : ('<Domain> [--long]', 'List all SCSI devices currently attached.'), + # tmem + 'tmem-list' : ('[-l|--long] [<Domain>|-a|--all]', 'List tmem pools.'), + 'tmem-thaw' : ('[<Domain>|-a|--all]', 'Thaw tmem pools.'), + 'tmem-freeze' : ('[<Domain>|-a|--all]', 'Freeze tmem pools.'), + 'tmem-destroy' : ('[<Domain>|-a|--all]', 'Destroy tmem pools.'), + 'tmem-set' : ('[<Domain>|-a|--all] [weight=<weight>] [cap=<cap>] ' + '[compress=<compress>]', + 'Change tmem settings.'), + # security 'addlabel' : ('<label> {dom <ConfigFile>|res <resource>|mgt <managed domain>}\n' @@ -283,6 +292,21 @@ SUBCOMMAND_OPTIONS = { 'info': ( ('-c', '--config', 'List Xend configuration parameters'), ), + 'tmem-list': ( + ('-l', '--long', 'List tmem stats.'), + ), + 'tmem-thaw': ( + ('-a', '--all', 'Thaw all tmem.'), + ), + 'tmem-freeze': ( 
+ ('-a', '--all', 'Freeze all tmem.'), + ), + 'tmem-destroy': ( + ('-a', '--all', 'Destroy all tmem.'), + ), + 'tmem-set': ( + ('-a', '--all', 'Operate on all tmem.'), + ), } common_commands = [ @@ -397,9 +421,17 @@ acm_commands = [ "getpolicy", ] +tmem_commands = [ + "tmem-list", + "tmem-thaw", + "tmem-freeze", + "tmem-destroy", + "tmem-set", + ] + all_commands = (domain_commands + host_commands + scheduler_commands + device_commands + vnet_commands + acm_commands + - ['shell', 'event-monitor']) + tmem_commands + ['shell', 'event-monitor']) ## @@ -2837,7 +2869,188 @@ def xm_network_show(args): print format2 % r - +def xm_tmem_list(args): + try: + (options, params) = getopt.gnu_getopt(args, 'la', ['long','all']) + except getopt.GetoptError, opterr: + err(opterr) + usage('tmem-list') + + use_long = False + for (k, v) in options: + if k in ['-l', '--long']: + use_long = True + + all = False + for (k, v) in options: + if k in ['-a', '--all']: + all = True + + if not all and len(params) == 0: + err('You must specify -a or --all or a domain id.') + usage('tmem-list') + + if all: + domid = -1 + else: + try: + domid = int(params[0]) + params = params[1:] + except: + err('Unrecognized domain id: %s' % params[0]) + usage('tmem-list') + + if serverType == SERVER_XEN_API: + print server.xenapi.host.tmem_list(domid,use_long) + else: + print server.xend.node.tmem_list(domid,use_long) + +def parse_tmem_args(args, name): + try: + (options, params) = getopt.gnu_getopt(args, 'a', ['all']) + except getopt.GetoptError, opterr: + err(opterr) + usage(name) + + all = False + for (k, v) in options: + if k in ['-a', '--all']: + all = True + + if not all and len(params) == 0: + err('You must specify -a or --all or a domain id.') + usage(name) + + if all: + domid = -1 + else: + try: + domid = int(params[0]) + params = params[1:] + except: + err('Unrecognized domain id: %s' % params[0]) + usage(name) + + return domid, params + +def xm_tmem_destroy(args): + (domid, _) = parse_tmem_args(args, 
'tmem-destroy') + if serverType == SERVER_XEN_API: + server.xenapi.host.tmem_destroy(domid) + else: + server.xend.node.tmem_destroy(domid) + +def xm_tmem_thaw(args): + (domid, _) = parse_tmem_args(args, 'tmem-thaw') + if serverType == SERVER_XEN_API: + server.xenapi.host.tmem_thaw(domid) + else: + server.xend.node.tmem_thaw(domid) + +def xm_tmem_freeze(args): + (domid, _) = parse_tmem_args(args, 'tmem-freeze') + if serverType == SERVER_XEN_API: + server.xenapi.host.tmem_freeze(domid) + else: + server.xend.node.tmem_freeze(domid) + +def xm_tmem_flush(args): + try: + (options, params) = getopt.gnu_getopt(args, 'a', ['all']) + except getopt.GetoptError, opterr: + err(opterr) + usage(name) + + all = False + for (k, v) in options: + if k in ['-a', '--all']: + all = True + + if not all and len(params) == 0: + err('You must specify -a or --all or a domain id.') + usage('tmem-flush') + + if all: + domid = -1 + else: + try: + domid = int(params[0]) + params = params[1:] + except: + err('Unrecognized domain id: %s' % params[0]) + usage('tmem-flush') + + pages = -1 + for (k, v) in options: + if k in ['-p', '--pages']: + pages = v + + if serverType == SERVER_XEN_API: + server.xenapi.host.tmem_flush(domid,pages) + else: + server.xend.node.tmem_flush(domid,pages) + +def xm_tmem_set(args): + try: + (options, params) = getopt.gnu_getopt(args, 'a', ['all']) + except getopt.GetoptError, opterr: + err(opterr) + usage(name) + + all = False + for (k, v) in options: + if k in ['-a', '--all']: + all = True + + if not all and len(params) == 0: + err('You must specify -a or --all or a domain id.') + usage('tmem-set') + + if all: + domid = -1 + else: + try: + domid = int(params[0]) + params = params[1:] + except: + err('Unrecognized domain id: %s' % params[0]) + usage('tmem-set') + + weight = None + cap = None + compress = None + for item in params: + if item.startswith('weight='): + try: + weight = int(item[7:]) + except: + err('weight should be a integer') + usage('tmem-set') + if 
item.startswith('cap='): + cap = int(item[4:]) + if item.startswith('compress='): + compress = int(item[9:]) + + if weight is None and cap is None and compress is None: + err('Unrecognized tmem configuration option: %s' % item) + usage('tmem-set') + + if serverType == SERVER_XEN_API: + if weight is not None: + server.xenapi.host.tmem_set_weight(domid, weight) + if cap is not None: + server.xenapi.host.tmem_set_cap(domid, cap) + if compress is not None: + server.xenapi.host.tmem_set_compress(domid, compress) + else: + if weight is not None: + server.xend.node.tmem_set_weight(domid, weight) + if cap is not None: + server.xend.node.tmem_set_cap(domid, cap) + if compress is not None: + server.xend.node.tmem_set_compress(domid, compress) + + commands = { "shell": xm_shell, "event-monitor": xm_event_monitor, @@ -2912,6 +3125,13 @@ commands = { "scsi-attach": xm_scsi_attach, "scsi-detach": xm_scsi_detach, "scsi-list": xm_scsi_list, + # tmem + "tmem-thaw": xm_tmem_thaw, + "tmem-freeze": xm_tmem_freeze, + "tmem-flush": xm_tmem_flush, + "tmem-destroy": xm_tmem_destroy, + "tmem-list": xm_tmem_list, + "tmem-set": xm_tmem_set, } ## The commands supported by a separate argument parser in xend.xm. 
diff --git a/xen/arch/ia64/xen/mm.c b/xen/arch/ia64/xen/mm.c index c98272a0b3..20071061f2 100644 --- a/xen/arch/ia64/xen/mm.c +++ b/xen/arch/ia64/xen/mm.c @@ -2870,6 +2870,13 @@ steal_page(struct domain *d, struct page_info *page, unsigned int memflags) return -1; } +int +donate_page(struct domain *d, struct page_info *page, unsigned int memflags) +{ + /* needs to be implemented for transcendent memory (tmem) */ + ASSERT(0); +} + static void __guest_physmap_add_page(struct domain *d, unsigned long gpfn, unsigned long mfn) diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c index 113b8dd4de..1f4199d55f 100644 --- a/xen/arch/x86/mm.c +++ b/xen/arch/x86/mm.c @@ -3539,6 +3539,42 @@ int replace_grant_host_mapping( return rc; } +int donate_page( + struct domain *d, struct page_info *page, unsigned int memflags) +{ + spin_lock(&d->page_alloc_lock); + + if ( is_xen_heap_page(page) || (page_get_owner(page) != NULL) ) + goto fail; + + if ( d->is_dying ) + goto fail; + + if ( page->count_info & ~(PGC_allocated | 1) ) + goto fail; + + if ( !(memflags & MEMF_no_refcount) ) + { + if ( d->tot_pages >= d->max_pages ) + goto fail; + d->tot_pages++; + } + + page->count_info = PGC_allocated | 1; + page_set_owner(page, d); + page_list_add_tail(page,&d->page_list); + + spin_unlock(&d->page_alloc_lock); + return 0; + + fail: + spin_unlock(&d->page_alloc_lock); + MEM_LOG("Bad donate %p: ed=%p(%u), sd=%p, caf=%08lx, taf=%" PRtype_info, + (void *)page_to_mfn(page), d, d->domain_id, + page_get_owner(page), page->count_info, page->u.inuse.type_info); + return -1; +} + int steal_page( struct domain *d, struct page_info *page, unsigned int memflags) { diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c index d87d0827aa..026996ec09 100644 --- a/xen/arch/x86/setup.c +++ b/xen/arch/x86/setup.c @@ -110,6 +110,7 @@ extern void early_time_init(void); extern void early_cpu_init(void); extern void vesa_init(void); extern void vesa_mtrr_init(void); +extern void init_tmem(void); 
DEFINE_PER_CPU(struct desc_struct *, gdt_table) = boot_cpu_gdt_table; #ifdef CONFIG_COMPAT @@ -1063,6 +1064,8 @@ void __init __start_xen(unsigned long mbi_p) init_trace_bufs(); + init_tmem(); + console_endboot(); /* Hide UART from DOM0 if we're using it */ diff --git a/xen/common/Makefile b/xen/common/Makefile index 3054f2e271..08b9e2b00e 100644 --- a/xen/common/Makefile +++ b/xen/common/Makefile @@ -28,6 +28,11 @@ obj-y += version.o obj-y += vsprintf.o obj-y += xmalloc_tlsf.o obj-y += rcupdate.o +obj-y += tmem.o +obj-y += tmem_xen.o +obj-y += radix-tree.o +obj-y += rbtree.o +obj-y += lzo.o obj-$(perfc) += perfc.o obj-$(crash_debug) += gdbstub.o diff --git a/xen/common/compat/Makefile b/xen/common/compat/Makefile index 9a36a3dcd3..1cf289ab3e 100644 --- a/xen/common/compat/Makefile +++ b/xen/common/compat/Makefile @@ -3,3 +3,4 @@ obj-y += kernel.o obj-y += memory.o obj-y += multicall.o obj-y += xlat.o +obj-y += tmem_xen.o diff --git a/xen/common/compat/tmem_xen.c b/xen/common/compat/tmem_xen.c new file mode 100644 index 0000000000..f6c9e0453d --- /dev/null +++ b/xen/common/compat/tmem_xen.c @@ -0,0 +1,26 @@ +/****************************************************************************** + * tmem_xen.c + * + */ + +#include <xen/config.h> +#include <xen/lib.h> +#include <xen/sched.h> +#include <xen/domain.h> +#include <xen/guest_access.h> +#include <xen/hypercall.h> +#include <compat/tmem.h> + +#define xen_tmem_op tmem_op +/*CHECK_tmem_op;*/ +#undef xen_tmem_op + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff --git a/xen/common/domain.c b/xen/common/domain.c index 187735b18c..66694168a2 100644 --- a/xen/common/domain.c +++ b/xen/common/domain.c @@ -31,6 +31,7 @@ #include <public/vcpu.h> #include <xsm/xsm.h> #include <xen/trace.h> +#include <xen/tmem.h> /* Linux config option: propageted to domain0 */ /* xen_processor_pmbits: xen control Cx, Px, ... 
*/ @@ -558,6 +559,9 @@ static void complete_domain_destroy(struct rcu_head *head) grant_table_destroy(d); + if ( d->tmem != NULL ) + tmem_destroy(d->tmem); + arch_domain_destroy(d); rangeset_domain_destroy(d); diff --git a/xen/common/lzo.c b/xen/common/lzo.c new file mode 100644 index 0000000000..eeb200b281 --- /dev/null +++ b/xen/common/lzo.c @@ -0,0 +1,518 @@ +/* + * lzo.c -- LZO1X Compressor from MiniLZO + * + * Copyright (C) 1996-2005 Markus F.X.J. Oberhumer <markus@oberhumer.com> + * + * The full LZO package can be found at: + * http://www.oberhumer.com/opensource/lzo/ + * + * Adapted for Xen (files combined and syntactic/header changes) by: + * Dan Magenheimer <dan.magenheimer@oracle.com> + * + */ + +/* + * lzodefs.h -- architecture, OS and compiler specific defines + * + * Copyright (C) 1996-2005 Markus F.X.J. Oberhumer <markus@oberhumer.com> + * + * The full LZO package can be found at: + * http://www.oberhumer.com/opensource/lzo/ + * + * Changed for kernel use by: + * Nitin Gupta <nitingupta910@gmail.com> + * Richard Purdie <rpurdie@openedhand.com> + */ + +#define LZO_VERSION 0x2020 +#define LZO_VERSION_STRING "2.02" +#define LZO_VERSION_DATE "Oct 17 2005" + +#define M1_MAX_OFFSET 0x0400 +#define M2_MAX_OFFSET 0x0800 +#define M3_MAX_OFFSET 0x4000 +#define M4_MAX_OFFSET 0xbfff + +#define M1_MIN_LEN 2 +#define M1_MAX_LEN 2 +#define M2_MIN_LEN 3 +#define M2_MAX_LEN 8 +#define M3_MIN_LEN 3 +#define M3_MAX_LEN 33 +#define M4_MIN_LEN 3 +#define M4_MAX_LEN 9 + +#define M1_MARKER 0 +#define M2_MARKER 64 +#define M3_MARKER 32 +#define M4_MARKER 16 + +#define D_BITS 14 +#define D_MASK ((1u << D_BITS) - 1) +#define D_HIGH ((D_MASK >> 1) + 1) + +#define DX2(p, s1, s2) (((((size_t)((p)[2]) << (s2)) ^ (p)[1]) \ + << (s1)) ^ (p)[0]) +#define DX3(p, s1, s2, s3) ((DX2((p)+1, s2, s3) << (s1)) ^ (p)[0]) + +/* + * LZO1X Compressor from MiniLZO + * + * Copyright (C) 1996-2005 Markus F.X.J. 
Oberhumer <markus@oberhumer.com> + * + * The full LZO package can be found at: + * http://www.oberhumer.com/opensource/lzo/ + * + * Changed for kernel use by: + * Nitin Gupta <nitingupta910@gmail.com> + * Richard Purdie <rpurdie@openedhand.com> + */ + +#include <xen/types.h> +#include <xen/lzo.h> +#define get_unaligned(_p) (*(_p)) +#define put_unaligned(_val,_p) (*(_p)=_val) +#define get_unaligned_le16(_p) (*(u16 *)(_p)) + +static noinline size_t +_lzo1x_1_do_compress(const unsigned char *in, size_t in_len, + unsigned char *out, size_t *out_len, void *wrkmem) +{ + const unsigned char * const in_end = in + in_len; + const unsigned char * const ip_end = in + in_len - M2_MAX_LEN - 5; + const unsigned char ** const dict = wrkmem; + const unsigned char *ip = in, *ii = ip; + const unsigned char *end, *m, *m_pos; + size_t m_off, m_len, dindex; + unsigned char *op = out; + + ip += 4; + + for (;;) { + dindex = ((size_t)(0x21 * DX3(ip, 5, 5, 6)) >> 5) & D_MASK; + m_pos = dict[dindex]; + + if (m_pos < in) + goto literal; + + if (ip == m_pos || ((size_t)(ip - m_pos) > M4_MAX_OFFSET)) + goto literal; + + m_off = ip - m_pos; + if (m_off <= M2_MAX_OFFSET || m_pos[3] == ip[3]) + goto try_match; + + dindex = (dindex & (D_MASK & 0x7ff)) ^ (D_HIGH | 0x1f); + m_pos = dict[dindex]; + + if (m_pos < in) + goto literal; + + if (ip == m_pos || ((size_t)(ip - m_pos) > M4_MAX_OFFSET)) + goto literal; + + m_off = ip - m_pos; + if (m_off <= M2_MAX_OFFSET || m_pos[3] == ip[3]) + goto try_match; + + goto literal; + + try_match: + if (get_unaligned((const unsigned short *)m_pos) + == get_unaligned((const unsigned short *)ip)) { + if (likely(m_pos[2] == ip[2])) + goto match; + } + + literal: + dict[dindex] = ip; + ++ip; + if (unlikely(ip >= ip_end)) + break; + continue; + + match: + dict[dindex] = ip; + if (ip != ii) { + size_t t = ip - ii; + + if (t <= 3) { + op[-2] |= t; + } else if (t <= 18) { + *op++ = (t - 3); + } else { + size_t tt = t - 18; + + *op++ = 0; + while (tt > 255) { + tt -= 255; + 
*op++ = 0; + } + *op++ = tt; + } + do { + *op++ = *ii++; + } while (--t > 0); + } + + ip += 3; + if (m_pos[3] != *ip++ || m_pos[4] != *ip++ + || m_pos[5] != *ip++ || m_pos[6] != *ip++ + || m_pos[7] != *ip++ || m_pos[8] != *ip++) { + --ip; + m_len = ip - ii; + + if (m_off <= M2_MAX_OFFSET) { + m_off -= 1; + *op++ = (((m_len - 1) << 5) + | ((m_off & 7) << 2)); + *op++ = (m_off >> 3); + } else if (m_off <= M3_MAX_OFFSET) { + m_off -= 1; + *op++ = (M3_MARKER | (m_len - 2)); + goto m3_m4_offset; + } else { + m_off -= 0x4000; + + *op++ = (M4_MARKER | ((m_off & 0x4000) >> 11) + | (m_len - 2)); + goto m3_m4_offset; + } + } else { + end = in_end; + m = m_pos + M2_MAX_LEN + 1; + + while (ip < end && *m == *ip) { + m++; + ip++; + } + m_len = ip - ii; + + if (m_off <= M3_MAX_OFFSET) { + m_off -= 1; + if (m_len <= 33) { + *op++ = (M3_MARKER | (m_len - 2)); + } else { + m_len -= 33; + *op++ = M3_MARKER | 0; + goto m3_m4_len; + } + } else { + m_off -= 0x4000; + if (m_len <= M4_MAX_LEN) { + *op++ = (M4_MARKER + | ((m_off & 0x4000) >> 11) + | (m_len - 2)); + } else { + m_len -= M4_MAX_LEN; + *op++ = (M4_MARKER + | ((m_off & 0x4000) >> 11)); + m3_m4_len: + while (m_len > 255) { + m_len -= 255; + *op++ = 0; + } + + *op++ = (m_len); + } + } + m3_m4_offset: + *op++ = ((m_off & 63) << 2); + *op++ = (m_off >> 6); + } + + ii = ip; + if (unlikely(ip >= ip_end)) + break; + } + + *out_len = op - out; + return in_end - ii; +} + +int lzo1x_1_compress(const unsigned char *in, size_t in_len, unsigned char *out, + size_t *out_len, void *wrkmem) +{ + const unsigned char *ii; + unsigned char *op = out; + size_t t; + + if (unlikely(in_len <= M2_MAX_LEN + 5)) { + t = in_len; + } else { + t = _lzo1x_1_do_compress(in, in_len, op, out_len, wrkmem); + op += *out_len; + } + + if (t > 0) { + ii = in + in_len - t; + + if (op == out && t <= 238) { + *op++ = (17 + t); + } else if (t <= 3) { + op[-2] |= t; + } else if (t <= 18) { + *op++ = (t - 3); + } else { + size_t tt = t - 18; + + *op++ = 0; + while (tt > 
255) { + tt -= 255; + *op++ = 0; + } + + *op++ = tt; + } + do { + *op++ = *ii++; + } while (--t > 0); + } + + *op++ = M4_MARKER | 1; + *op++ = 0; + *op++ = 0; + + *out_len = op - out; + return LZO_E_OK; +} + +/* + * LZO1X Decompressor from MiniLZO + * + * Copyright (C) 1996-2005 Markus F.X.J. Oberhumer <markus@oberhumer.com> + * + * The full LZO package can be found at: + * http://www.oberhumer.com/opensource/lzo/ + * + * Changed for kernel use by: + * Nitin Gupta <nitingupta910@gmail.com> + * Richard Purdie <rpurdie@openedhand.com> + */ + +#define HAVE_IP(x, ip_end, ip) ((size_t)(ip_end - ip) < (x)) +#define HAVE_OP(x, op_end, op) ((size_t)(op_end - op) < (x)) +#define HAVE_LB(m_pos, out, op) (m_pos < out || m_pos >= op) + +#define COPY4(dst, src) \ + put_unaligned(get_unaligned((const u32 *)(src)), (u32 *)(dst)) + +int lzo1x_decompress_safe(const unsigned char *in, size_t in_len, + unsigned char *out, size_t *out_len) +{ + const unsigned char * const ip_end = in + in_len; + unsigned char * const op_end = out + *out_len; + const unsigned char *ip = in, *m_pos; + unsigned char *op = out; + size_t t; + + *out_len = 0; + + if (*ip > 17) { + t = *ip++ - 17; + if (t < 4) + goto match_next; + if (HAVE_OP(t, op_end, op)) + goto output_overrun; + if (HAVE_IP(t + 1, ip_end, ip)) + goto input_overrun; + do { + *op++ = *ip++; + } while (--t > 0); + goto first_literal_run; + } + + while ((ip < ip_end)) { + t = *ip++; + if (t >= 16) + goto match; + if (t == 0) { + if (HAVE_IP(1, ip_end, ip)) + goto input_overrun; + while (*ip == 0) { + t += 255; + ip++; + if (HAVE_IP(1, ip_end, ip)) + goto input_overrun; + } + t += 15 + *ip++; + } + if (HAVE_OP(t + 3, op_end, op)) + goto output_overrun; + if (HAVE_IP(t + 4, ip_end, ip)) + goto input_overrun; + + COPY4(op, ip); + op += 4; + ip += 4; + if (--t > 0) { + if (t >= 4) { + do { + COPY4(op, ip); + op += 4; + ip += 4; + t -= 4; + } while (t >= 4); + if (t > 0) { + do { + *op++ = *ip++; + } while (--t > 0); + } + } else { + do { + *op++ 
= *ip++; + } while (--t > 0); + } + } + + first_literal_run: + t = *ip++; + if (t >= 16) + goto match; + m_pos = op - (1 + M2_MAX_OFFSET); + m_pos -= t >> 2; + m_pos -= *ip++ << 2; + + if (HAVE_LB(m_pos, out, op)) + goto lookbehind_overrun; + + if (HAVE_OP(3, op_end, op)) + goto output_overrun; + *op++ = *m_pos++; + *op++ = *m_pos++; + *op++ = *m_pos; + + goto match_done; + + do { + match: + if (t >= 64) { + m_pos = op - 1; + m_pos -= (t >> 2) & 7; + m_pos -= *ip++ << 3; + t = (t >> 5) - 1; + if (HAVE_LB(m_pos, out, op)) + goto lookbehind_overrun; + if (HAVE_OP(t + 3 - 1, op_end, op)) + goto output_overrun; + goto copy_match; + } else if (t >= 32) { + t &= 31; + if (t == 0) { + if (HAVE_IP(1, ip_end, ip)) + goto input_overrun; + while (*ip == 0) { + t += 255; + ip++; + if (HAVE_IP(1, ip_end, ip)) + goto input_overrun; + } + t += 31 + *ip++; + } + m_pos = op - 1; + m_pos -= get_unaligned_le16(ip) >> 2; + ip += 2; + } else if (t >= 16) { + m_pos = op; + m_pos -= (t & 8) << 11; + + t &= 7; + if (t == 0) { + if (HAVE_IP(1, ip_end, ip)) + goto input_overrun; + while (*ip == 0) { + t += 255; + ip++; + if (HAVE_IP(1, ip_end, ip)) + goto input_overrun; + } + t += 7 + *ip++; + } + m_pos -= get_unaligned_le16(ip) >> 2; + ip += 2; + if (m_pos == op) + goto eof_found; + m_pos -= 0x4000; + } else { + m_pos = op - 1; + m_pos -= t >> 2; + m_pos -= *ip++ << 2; + + if (HAVE_LB(m_pos, out, op)) + goto lookbehind_overrun; + if (HAVE_OP(2, op_end, op)) + goto output_overrun; + + *op++ = *m_pos++; + *op++ = *m_pos; + goto match_done; + } + + if (HAVE_LB(m_pos, out, op)) + goto lookbehind_overrun; + if (HAVE_OP(t + 3 - 1, op_end, op)) + goto output_overrun; + + if (t >= 2 * 4 - (3 - 1) && (op - m_pos) >= 4) { + COPY4(op, m_pos); + op += 4; + m_pos += 4; + t -= 4 - (3 - 1); + do { + COPY4(op, m_pos); + op += 4; + m_pos += 4; + t -= 4; + } while (t >= 4); + if (t > 0) + do { + *op++ = *m_pos++; + } while (--t > 0); + } else { + copy_match: + *op++ = *m_pos++; + *op++ = *m_pos++; + do { + 
*op++ = *m_pos++; + } while (--t > 0); + } + match_done: + t = ip[-2] & 3; + if (t == 0) + break; + match_next: + if (HAVE_OP(t, op_end, op)) + goto output_overrun; + if (HAVE_IP(t + 1, ip_end, ip)) + goto input_overrun; + + *op++ = *ip++; + if (t > 1) { + *op++ = *ip++; + if (t > 2) + *op++ = *ip++; + } + + t = *ip++; + } while (ip < ip_end); + } + + *out_len = op - out; + return LZO_E_EOF_NOT_FOUND; + + eof_found: + *out_len = op - out; + return (ip == ip_end ? LZO_E_OK : + (ip < ip_end ? LZO_E_INPUT_NOT_CONSUMED : LZO_E_INPUT_OVERRUN)); + input_overrun: + *out_len = op - out; + return LZO_E_INPUT_OVERRUN; + + output_overrun: + *out_len = op - out; + return LZO_E_OUTPUT_OVERRUN; + + lookbehind_overrun: + *out_len = op - out; + return LZO_E_LOOKBEHIND_OVERRUN; +} diff --git a/xen/common/memory.c b/xen/common/memory.c index 55e2d8a046..0dd2b9282f 100644 --- a/xen/common/memory.c +++ b/xen/common/memory.c @@ -560,17 +560,6 @@ long do_memory_op(unsigned long cmd, XEN_GUEST_HANDLE(void) arg) return rc; } -/* Temporary placeholder. */ -int do_tmem_op(void *tmem_op) -{ - static bool_t warned; - - if ( !test_and_set_bool(warned) ) - printk("tmem: not implemented\n"); - - return -ENOSYS; -} - /* * Local variables: * mode: C diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c index ab3445b44b..bb143aedd6 100644 --- a/xen/common/page_alloc.c +++ b/xen/common/page_alloc.c @@ -35,6 +35,7 @@ #include <xen/perfc.h> #include <xen/numa.h> #include <xen/nodemask.h> +#include <xen/tmem.h> #include <public/sysctl.h> #include <asm/page.h> #include <asm/numa.h> @@ -335,9 +336,9 @@ static unsigned long init_node_heap(int node, unsigned long mfn, /* Allocate 2^@order contiguous pages. 
*/ static struct page_info *alloc_heap_pages( unsigned int zone_lo, unsigned int zone_hi, - unsigned int node, unsigned int order) + unsigned int node, unsigned int order, unsigned int memflags) { - unsigned int i, j, zone; + unsigned int i, j, zone = 0; unsigned int num_nodes = num_online_nodes(); unsigned long request = 1UL << order; cpumask_t extra_cpus_mask, mask; @@ -380,6 +381,14 @@ static struct page_info *alloc_heap_pages( node = 0; } + /* Try to free memory from tmem */ + if ( (pg = tmem_relinquish_pages(order,memflags)) != NULL ) + { + /* reassigning an already allocated anonymous heap page */ + spin_unlock(&heap_lock); + return pg; + } + /* No suitable memory blocks. Fail the request. */ spin_unlock(&heap_lock); return NULL; @@ -1018,8 +1027,8 @@ void *alloc_xenheap_pages(unsigned int order, unsigned int memflags) ASSERT(!in_irq()); - pg = alloc_heap_pages( - MEMZONE_XEN, MEMZONE_XEN, cpu_to_node(smp_processor_id()), order); + pg = alloc_heap_pages(MEMZONE_XEN, MEMZONE_XEN, + cpu_to_node(smp_processor_id()), order, memflags); if ( unlikely(pg == NULL) ) return NULL; @@ -1172,11 +1181,11 @@ struct page_info *alloc_domheap_pages( return NULL; if ( dma_bitsize && ((dma_zone = bits_to_zone(dma_bitsize)) < zone_hi) ) - pg = alloc_heap_pages(dma_zone + 1, zone_hi, node, order); + pg = alloc_heap_pages(dma_zone + 1, zone_hi, node, order, memflags); if ( (pg == NULL) && ((pg = alloc_heap_pages(MEMZONE_XEN + 1, zone_hi, - node, order)) == NULL) ) + node, order, memflags)) == NULL) ) return NULL; if ( (d != NULL) && assign_pages(d, pg, order, memflags) ) @@ -1373,6 +1382,28 @@ static void page_scrub_softirq(void) spin_unlock(&serialise_lock); } +void scrub_list_splice(struct page_list_head *list) +{ + spin_lock(&page_scrub_lock); + page_list_splice(list, &page_scrub_list); + spin_unlock(&page_scrub_lock); +} + +void scrub_list_add(struct page_info *pg) +{ + spin_lock(&page_scrub_lock); + page_list_add(pg, &page_scrub_list); + spin_unlock(&page_scrub_lock); +} + 
+void scrub_one_page(struct page_info *pg) +{ + void *p = map_domain_page(page_to_mfn(pg)); + + scrub_page(p); + unmap_domain_page(p); +} + static void page_scrub_timer_fn(void *unused) { page_scrub_schedule_work(); diff --git a/xen/common/radix-tree.c b/xen/common/radix-tree.c new file mode 100644 index 0000000000..414f0cef72 --- /dev/null +++ b/xen/common/radix-tree.c @@ -0,0 +1,448 @@ +/* + * Copyright (C) 2001 Momchil Velikov + * Portions Copyright (C) 2001 Christoph Hellwig + * Copyright (C) 2005 SGI, Christoph Lameter <clameter@sgi.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2, or (at + * your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* + * Copyright (C) 2009 adaptation for Xen tmem by Dan Magenheimer, Oracle Corp. 
+ * Changed: + * o Linux 2.6.18 source used (prior to read-copy-update addition) + * o constants and data structures moved out to radix-tree.h header + * o tagging code removed + * o radix_tree_insert has func parameter for dynamic data struct allocation + * o radix_tree_destroy added (including recursive helper function) + * o __init functions must be called explicitly + * o other include files adapted to Xen + */ + +#include <xen/config.h> +#include <xen/lib.h> +#include <xen/types.h> +#include <xen/errno.h> +#include <xen/radix-tree.h> +#include <asm/cache.h> + +static unsigned long height_to_maxindex[RADIX_TREE_MAX_PATH + 1] __read_mostly; + +/* + * Return the maximum key which can be store into a + * radix tree with height HEIGHT. + */ +static inline unsigned long radix_tree_maxindex(unsigned int height) +{ + return height_to_maxindex[height]; +} + +/* + * Extend a radix tree so it can store key @index. + */ +static int radix_tree_extend(struct radix_tree_root *root, unsigned long index, + struct radix_tree_node *(*node_alloc)(void *), void *arg) +{ + struct radix_tree_node *node; + unsigned int height; + + /* Figure out what the height should be. */ + height = root->height + 1; + if (index > radix_tree_maxindex(height)) + while (index > radix_tree_maxindex(height)) + height++; + + if (root->rnode == NULL) { + root->height = height; + goto out; + } + + do { + if (!(node = node_alloc(arg))) + return -ENOMEM; + + /* Increase the height. */ + node->slots[0] = root->rnode; + + node->count = 1; + root->rnode = node; + root->height++; + } while (height > root->height); + out: + return 0; +} + +/** + * radix_tree_insert - insert into a radix tree + * @root: radix tree root + * @index: index key + * @item: item to insert + * + * Insert an item into the radix tree at position @index. 
+ */ +int radix_tree_insert(struct radix_tree_root *root, unsigned long index, + void *item, struct radix_tree_node *(*node_alloc)(void *), void *arg) +{ + struct radix_tree_node *node = NULL, *slot; + unsigned int height, shift; + int offset; + int error; + + /* Make sure the tree is high enough. */ + if (index > radix_tree_maxindex(root->height)) { + error = radix_tree_extend(root, index, node_alloc, arg); + if (error) + return error; + } + + slot = root->rnode; + height = root->height; + shift = (height-1) * RADIX_TREE_MAP_SHIFT; + + offset = 0; /* uninitialised var warning */ + while (height > 0) { + if (slot == NULL) { + /* Have to add a child node. */ + if (!(slot = node_alloc(arg))) + return -ENOMEM; + if (node) { + + node->slots[offset] = slot; + node->count++; + } else + root->rnode = slot; + } + + /* Go a level down */ + offset = (index >> shift) & RADIX_TREE_MAP_MASK; + node = slot; + slot = node->slots[offset]; + shift -= RADIX_TREE_MAP_SHIFT; + height--; + } + + if (slot != NULL) + return -EEXIST; + + if (node) { + node->count++; + node->slots[offset] = item; + } else { + root->rnode = item; + } + + return 0; +} +EXPORT_SYMBOL(radix_tree_insert); + +static inline void **__lookup_slot(struct radix_tree_root *root, + unsigned long index) +{ + unsigned int height, shift; + struct radix_tree_node **slot; + + height = root->height; + + if (index > radix_tree_maxindex(height)) + return NULL; + + if (height == 0 && root->rnode) + return (void **)&root->rnode; + + shift = (height-1) * RADIX_TREE_MAP_SHIFT; + slot = &root->rnode; + + while (height > 0) { + if (*slot == NULL) + return NULL; + + slot = (struct radix_tree_node **) + ((*slot)->slots + + ((index >> shift) & RADIX_TREE_MAP_MASK)); + shift -= RADIX_TREE_MAP_SHIFT; + height--; + } + + return (void **)slot; +} + +/** + * radix_tree_lookup_slot - lookup a slot in a radix tree + * @root: radix tree root + * @index: index key + * + * Lookup the slot corresponding to the position @index in the radix tree + 
* @root. This is useful for update-if-exists operations. + */ +void **radix_tree_lookup_slot(struct radix_tree_root *root, unsigned long index) +{ + return __lookup_slot(root, index); +} +EXPORT_SYMBOL(radix_tree_lookup_slot); + +/** + * radix_tree_lookup - perform lookup operation on a radix tree + * @root: radix tree root + * @index: index key + * + * Lookup the item at the position @index in the radix tree @root. + */ +void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index) +{ + void **slot; + + slot = __lookup_slot(root, index); + return slot != NULL ? *slot : NULL; +} +EXPORT_SYMBOL(radix_tree_lookup); + +static unsigned int +__lookup(struct radix_tree_root *root, void **results, unsigned long index, + unsigned int max_items, unsigned long *next_index) +{ + unsigned int nr_found = 0; + unsigned int shift, height; + struct radix_tree_node *slot; + unsigned long i; + + height = root->height; + if (index > radix_tree_maxindex(height)) + if (height == 0) { + if (root->rnode && index == 0) + results[nr_found++] = root->rnode; + goto out; + } + + shift = (height-1) * RADIX_TREE_MAP_SHIFT; + slot = root->rnode; + + for ( ; height > 1; height--) { + + for (i = (index >> shift) & RADIX_TREE_MAP_MASK ; + i < RADIX_TREE_MAP_SIZE; i++) { + if (slot->slots[i] != NULL) + break; + index &= ~((1UL << shift) - 1); + index += 1UL << shift; + if (index == 0) + goto out; /* 32-bit wraparound */ + } + if (i == RADIX_TREE_MAP_SIZE) + goto out; + + shift -= RADIX_TREE_MAP_SHIFT; + slot = slot->slots[i]; + } + + /* Bottom level: grab some items */ + for (i = index & RADIX_TREE_MAP_MASK; i < RADIX_TREE_MAP_SIZE; i++) { + index++; + if (slot->slots[i]) { + results[nr_found++] = slot->slots[i]; + if (nr_found == max_items) + goto out; + } + } + out: + *next_index = index; + return nr_found; +} + +/** + * radix_tree_gang_lookup - perform multiple lookup on a radix tree + * @root: radix tree root + * @results: where the results of the lookup are placed + * @first_index: 
start the lookup from this key + * @max_items: place up to this many items at *results + * + * Performs an index-ascending scan of the tree for present items. Places + * them at *@results and returns the number of items which were placed at + * *@results. + * + * The implementation is naive. + */ +unsigned int +radix_tree_gang_lookup(struct radix_tree_root *root, void **results, + unsigned long first_index, unsigned int max_items) +{ + const unsigned long max_index = radix_tree_maxindex(root->height); + unsigned long cur_index = first_index; + unsigned int ret = 0; + + while (ret < max_items) { + unsigned int nr_found; + unsigned long next_index; /* Index of next search */ + + if (cur_index > max_index) + break; + nr_found = __lookup(root, results + ret, cur_index, + max_items - ret, &next_index); + ret += nr_found; + if (next_index == 0) + break; + cur_index = next_index; + } + return ret; +} +EXPORT_SYMBOL(radix_tree_gang_lookup); + +/** + * radix_tree_shrink - shrink height of a radix tree to minimal + * @root radix tree root + */ +static inline void radix_tree_shrink(struct radix_tree_root *root, + void (*node_free)(struct radix_tree_node *)) +{ + /* try to shrink tree height */ + while (root->height > 0 && + root->rnode->count == 1 && + root->rnode->slots[0]) { + struct radix_tree_node *to_free = root->rnode; + + root->rnode = to_free->slots[0]; + root->height--; + to_free->slots[0] = NULL; + to_free->count = 0; + node_free(to_free); + } +} + +/** + * radix_tree_delete - delete an item from a radix tree + * @root: radix tree root + * @index: index key + * + * Remove the item at @index from the radix tree rooted at @root. + * + * Returns the address of the deleted item, or NULL if it was not present. 
+ */ +void *radix_tree_delete(struct radix_tree_root *root, unsigned long index, + void(*node_free)(struct radix_tree_node *)) +{ + struct radix_tree_path path[RADIX_TREE_MAX_PATH + 1], *pathp = path; + struct radix_tree_node *slot = NULL; + unsigned int height, shift; + int offset; + + height = root->height; + if (index > radix_tree_maxindex(height)) + goto out; + + slot = root->rnode; + if (height == 0 && root->rnode) { + root->rnode = NULL; + goto out; + } + + shift = (height - 1) * RADIX_TREE_MAP_SHIFT; + pathp->node = NULL; + + do { + if (slot == NULL) + goto out; + + pathp++; + offset = (index >> shift) & RADIX_TREE_MAP_MASK; + pathp->offset = offset; + pathp->node = slot; + slot = slot->slots[offset]; + shift -= RADIX_TREE_MAP_SHIFT; + height--; + } while (height > 0); + + if (slot == NULL) + goto out; + + /* Now free the nodes we do not need anymore */ + while (pathp->node) { + pathp->node->slots[pathp->offset] = NULL; + pathp->node->count--; + + if (pathp->node->count) { + if (pathp->node == root->rnode) + radix_tree_shrink(root, node_free); + goto out; + } + + /* Node with zero slots in use so free it */ + node_free(pathp->node); + + pathp--; + } + root->height = 0; + root->rnode = NULL; + + out: + return slot; +} +EXPORT_SYMBOL(radix_tree_delete); + +static void +radix_tree_node_destroy(struct radix_tree_node *node, unsigned int height, + void (*slot_free)(void *), void (*node_free)(struct radix_tree_node *)) +{ + int i; + + if (height == 0) + return; + for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { + if (node->slots[i]) { + if (height == 1) { + slot_free(node->slots[i]); + node->slots[i] = NULL; + continue; + } + radix_tree_node_destroy(node->slots[i], height-1, + slot_free, node_free); + node_free(node->slots[i]); + node->slots[i] = NULL; + } + } +} + +void radix_tree_destroy(struct radix_tree_root *root, + void (*slot_free)(void *), void (*node_free)(struct radix_tree_node *)) +{ + if (root->rnode == NULL) + return; + if (root->height == 0) + 
slot_free(root->rnode); + else { + radix_tree_node_destroy(root->rnode, root->height, + slot_free, node_free); + node_free(root->rnode); + root->height = 0; + } + root->rnode = NULL; + /* caller must delete root if desired */ +} +EXPORT_SYMBOL(radix_tree_destroy); + +static /*__init*/ unsigned long __maxindex(unsigned int height) +{ + unsigned int tmp = height * RADIX_TREE_MAP_SHIFT; + unsigned long index = (~0UL >> (RADIX_TREE_INDEX_BITS - tmp - 1)) >> 1; + + if (tmp >= RADIX_TREE_INDEX_BITS) + index = ~0UL; + return index; +} + +/*__init*/ void radix_tree_init(void) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(height_to_maxindex); i++) + height_to_maxindex[i] = __maxindex(i); +} diff --git a/xen/common/rbtree.c b/xen/common/rbtree.c new file mode 100644 index 0000000000..67564c81b3 --- /dev/null +++ b/xen/common/rbtree.c @@ -0,0 +1,398 @@ +/* + Red Black Trees + (C) 1999 Andrea Arcangeli <andrea@suse.de> + (C) 2002 David Woodhouse <dwmw2@infradead.org> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + linux/lib/rbtree.c +*/ + +#include <xen/config.h> +#include <xen/types.h> +#include <xen/rbtree.h> + +static void __rb_rotate_left(struct rb_node *node, struct rb_root *root) +{ + struct rb_node *right = node->rb_right; + struct rb_node *parent = rb_parent(node); + + if ((node->rb_right = right->rb_left)) + rb_set_parent(right->rb_left, node); + right->rb_left = node; + + rb_set_parent(right, parent); + + if (parent) + { + if (node == parent->rb_left) + parent->rb_left = right; + else + parent->rb_right = right; + } + else + root->rb_node = right; + rb_set_parent(node, right); +} + +static void __rb_rotate_right(struct rb_node *node, struct rb_root *root) +{ + struct rb_node *left = node->rb_left; + struct rb_node *parent = rb_parent(node); + + if ((node->rb_left = left->rb_right)) + rb_set_parent(left->rb_right, node); + left->rb_right = node; + + rb_set_parent(left, parent); + + if (parent) + { + if (node == parent->rb_right) + parent->rb_right = left; + else + parent->rb_left = left; + } + else + root->rb_node = left; + rb_set_parent(node, left); +} + +void rb_insert_color(struct rb_node *node, struct rb_root *root) +{ + struct rb_node *parent, *gparent; + + while ((parent = rb_parent(node)) && rb_is_red(parent)) + { + gparent = rb_parent(parent); + + if (parent == gparent->rb_left) + { + { + register struct rb_node *uncle = gparent->rb_right; + if (uncle && rb_is_red(uncle)) + { + rb_set_black(uncle); + rb_set_black(parent); + rb_set_red(gparent); + node = gparent; + continue; + } + } + + if (parent->rb_right == node) + { + register struct rb_node *tmp; + __rb_rotate_left(parent, root); + tmp = parent; + parent = node; + node = tmp; + } + + rb_set_black(parent); + rb_set_red(gparent); + __rb_rotate_right(gparent, root); + } else { + { + register struct 
rb_node *uncle = gparent->rb_left; + if (uncle && rb_is_red(uncle)) + { + rb_set_black(uncle); + rb_set_black(parent); + rb_set_red(gparent); + node = gparent; + continue; + } + } + + if (parent->rb_left == node) + { + register struct rb_node *tmp; + __rb_rotate_right(parent, root); + tmp = parent; + parent = node; + node = tmp; + } + + rb_set_black(parent); + rb_set_red(gparent); + __rb_rotate_left(gparent, root); + } + } + + rb_set_black(root->rb_node); +} +EXPORT_SYMBOL(rb_insert_color); + +static void __rb_erase_color(struct rb_node *node, struct rb_node *parent, + struct rb_root *root) +{ + struct rb_node *other; + + while ((!node || rb_is_black(node)) && node != root->rb_node) + { + if (parent->rb_left == node) + { + other = parent->rb_right; + if (rb_is_red(other)) + { + rb_set_black(other); + rb_set_red(parent); + __rb_rotate_left(parent, root); + other = parent->rb_right; + } + if ((!other->rb_left || rb_is_black(other->rb_left)) && + (!other->rb_right || rb_is_black(other->rb_right))) + { + rb_set_red(other); + node = parent; + parent = rb_parent(node); + } + else + { + if (!other->rb_right || rb_is_black(other->rb_right)) + { + struct rb_node *o_left; + if ((o_left = other->rb_left)) + rb_set_black(o_left); + rb_set_red(other); + __rb_rotate_right(other, root); + other = parent->rb_right; + } + rb_set_color(other, rb_color(parent)); + rb_set_black(parent); + if (other->rb_right) + rb_set_black(other->rb_right); + __rb_rotate_left(parent, root); + node = root->rb_node; + break; + } + } + else + { + other = parent->rb_left; + if (rb_is_red(other)) + { + rb_set_black(other); + rb_set_red(parent); + __rb_rotate_right(parent, root); + other = parent->rb_left; + } + if ((!other->rb_left || rb_is_black(other->rb_left)) && + (!other->rb_right || rb_is_black(other->rb_right))) + { + rb_set_red(other); + node = parent; + parent = rb_parent(node); + } + else + { + if (!other->rb_left || rb_is_black(other->rb_left)) + { + register struct rb_node *o_right; + if 
((o_right = other->rb_right)) + rb_set_black(o_right); + rb_set_red(other); + __rb_rotate_left(other, root); + other = parent->rb_left; + } + rb_set_color(other, rb_color(parent)); + rb_set_black(parent); + if (other->rb_left) + rb_set_black(other->rb_left); + __rb_rotate_right(parent, root); + node = root->rb_node; + break; + } + } + } + if (node) + rb_set_black(node); +} + +void rb_erase(struct rb_node *node, struct rb_root *root) +{ + struct rb_node *child, *parent; + int color; + + if (!node->rb_left) + child = node->rb_right; + else if (!node->rb_right) + child = node->rb_left; + else + { + struct rb_node *old = node, *left; + + node = node->rb_right; + while ((left = node->rb_left) != NULL) + node = left; + child = node->rb_right; + parent = rb_parent(node); + color = rb_color(node); + + if (child) + rb_set_parent(child, parent); + if (parent == old) { + parent->rb_right = child; + parent = node; + } else + parent->rb_left = child; + + node->rb_parent_color = old->rb_parent_color; + node->rb_right = old->rb_right; + node->rb_left = old->rb_left; + + if (rb_parent(old)) + { + if (rb_parent(old)->rb_left == old) + rb_parent(old)->rb_left = node; + else + rb_parent(old)->rb_right = node; + } else + root->rb_node = node; + + rb_set_parent(old->rb_left, node); + if (old->rb_right) + rb_set_parent(old->rb_right, node); + goto color; + } + + parent = rb_parent(node); + color = rb_color(node); + + if (child) + rb_set_parent(child, parent); + if (parent) + { + if (parent->rb_left == node) + parent->rb_left = child; + else + parent->rb_right = child; + } + else + root->rb_node = child; + + color: + if (color == RB_BLACK) + __rb_erase_color(child, parent, root); +} +EXPORT_SYMBOL(rb_erase); + +/* + * This function returns the first node (in sort order) of the tree. 
+ */ +struct rb_node *rb_first(struct rb_root *root) +{ + struct rb_node *n; + + n = root->rb_node; + if (!n) + return NULL; + while (n->rb_left) + n = n->rb_left; + return n; +} +EXPORT_SYMBOL(rb_first); + +struct rb_node *rb_last(struct rb_root *root) +{ + struct rb_node *n; + + n = root->rb_node; + if (!n) + return NULL; + while (n->rb_right) + n = n->rb_right; + return n; +} +EXPORT_SYMBOL(rb_last); + +struct rb_node *rb_next(struct rb_node *node) +{ + struct rb_node *parent; + + if (rb_parent(node) == node) + return NULL; + + /* If we have a right-hand child, go down and then left as far + as we can. */ + if (node->rb_right) { + node = node->rb_right; + while (node->rb_left) + node=node->rb_left; + return node; + } + + /* No right-hand children. Everything down and left is + smaller than us, so any 'next' node must be in the general + direction of our parent. Go up the tree; any time the + ancestor is a right-hand child of its parent, keep going + up. First time it's a left-hand child of its parent, said + parent is our 'next' node. */ + while ((parent = rb_parent(node)) && node == parent->rb_right) + node = parent; + + return parent; +} +EXPORT_SYMBOL(rb_next); + +struct rb_node *rb_prev(struct rb_node *node) +{ + struct rb_node *parent; + + if (rb_parent(node) == node) + return NULL; + + /* If we have a left-hand child, go down and then right as far + as we can. */ + if (node->rb_left) { + node = node->rb_left; + while (node->rb_right) + node=node->rb_right; + return node; + } + + /* No left-hand children. 
Go up till we find an ancestor which + is a right-hand child of its parent */ + while ((parent = rb_parent(node)) && node == parent->rb_left) + node = parent; + + return parent; +} +EXPORT_SYMBOL(rb_prev); + +void rb_replace_node(struct rb_node *victim, struct rb_node *new, + struct rb_root *root) +{ + struct rb_node *parent = rb_parent(victim); + + /* Set the surrounding nodes to point to the replacement */ + if (parent) { + if (victim == parent->rb_left) + parent->rb_left = new; + else + parent->rb_right = new; + } else { + root->rb_node = new; + } + if (victim->rb_left) + rb_set_parent(victim->rb_left, new); + if (victim->rb_right) + rb_set_parent(victim->rb_right, new); + + /* Copy the pointers/colour from the victim to the replacement */ + *new = *victim; +} +EXPORT_SYMBOL(rb_replace_node); diff --git a/xen/common/spinlock.c b/xen/common/spinlock.c index ac2aaab814..a17f0b2124 100644 --- a/xen/common/spinlock.c +++ b/xen/common/spinlock.c @@ -214,6 +214,12 @@ unsigned long _write_lock_irqsave(rwlock_t *lock) return flags; } +int _write_trylock(rwlock_t *lock) +{ + check_lock(&lock->debug); + return _raw_write_trylock(&lock->raw); +} + void _write_unlock(rwlock_t *lock) { _raw_write_unlock(&lock->raw); @@ -236,3 +242,9 @@ int _rw_is_locked(rwlock_t *lock) check_lock(&lock->debug); return _raw_rw_is_locked(&lock->raw); } + +int _rw_is_write_locked(rwlock_t *lock) +{ + check_lock(&lock->debug); + return _raw_rw_is_write_locked(&lock->raw); +} diff --git a/xen/common/tmem.c b/xen/common/tmem.c new file mode 100644 index 0000000000..19d8bec05c --- /dev/null +++ b/xen/common/tmem.c @@ -0,0 +1,2109 @@ +/****************************************************************************** + * tmem.c + * + * Transcendent memory + * + * Copyright (c) 2009, Dan Magenheimer, Oracle Corp. 
+ */ + +/* TODO list: 090129 + - improve on reclamation policy + - use different tlsf pools for each client (maybe each pool) + - implement page accounting and minimal QoS limits + - test shared access more completely (need pv cluster fs) + - add feedback-driven compression (not for persistent pools though!) + - add data-structure total bytes overhead stats + */ + +#ifdef __XEN__ +#include <xen/tmem_xen.h> /* host-specific (eg Xen) code goes here */ +#endif + +#include <xen/tmem.h> +#include <xen/rbtree.h> +#include <xen/radix-tree.h> +#include <xen/list.h> + +#define EXPORT /* indicates code other modules are dependent upon */ +#define FORWARD + +/************ INTERFACE TO TMEM HOST-DEPENDENT (tmh) CODE ************/ + +#define CLI_ID_NULL TMH_CLI_ID_NULL +#define cli_id_str tmh_cli_id_str +#define client_str tmh_client_str + +/************ DEBUG and STATISTICS (+ some compression testing) *******/ + +#ifndef NDEBUG +#define SENTINELS +#define NOINLINE noinline +#else +#define NOINLINE +#endif + +#ifdef SENTINELS +#define DECL_SENTINEL unsigned long sentinel; +#define SET_SENTINEL(_x,_y) _x->sentinel = _y##_SENTINEL +#define INVERT_SENTINEL(_x,_y) _x->sentinel = ~_y##_SENTINEL +#define ASSERT_SENTINEL(_x,_y) \ + ASSERT(_x->sentinel != ~_y##_SENTINEL);ASSERT(_x->sentinel == _y##_SENTINEL) +#ifdef __i386__ +#define POOL_SENTINEL 0x87658765 +#define OBJ_SENTINEL 0x12345678 +#define OBJNODE_SENTINEL 0xfedcba09 +#define PGD_SENTINEL 0x43214321 +#else +#define POOL_SENTINEL 0x8765876587658765 +#define OBJ_SENTINEL 0x1234567812345678 +#define OBJNODE_SENTINEL 0xfedcba0987654321 +#define PGD_SENTINEL 0x4321432143214321 +#endif +#else +#define DECL_SENTINEL +#define SET_SENTINEL(_x,_y) do { } while (0) +#define ASSERT_SENTINEL(_x,_y) do { } while (0) +#define INVERT_SENTINEL(_x,_y) do { } while (0) +#endif + +/* global statistics (none need to be locked) */ +static unsigned long total_tmem_ops = 0; +static unsigned long errored_tmem_ops = 0; +static unsigned long 
total_flush_pool = 0; +static unsigned long alloc_failed = 0, alloc_page_failed = 0; +static unsigned long evicted_pgs = 0, evict_attempts = 0; +static unsigned long relinq_pgs = 0, relinq_attempts = 0; +static unsigned long max_evicts_per_relinq = 0; +static unsigned long low_on_memory = 0; +static int global_obj_count_max = 0; +static int global_pgp_count_max = 0; +static int global_page_count_max = 0; +static int global_rtree_node_count_max = 0; +static long global_eph_count_max = 0; +static unsigned long failed_copies; + +DECL_CYC_COUNTER(succ_get); +DECL_CYC_COUNTER(succ_put); +DECL_CYC_COUNTER(non_succ_get); +DECL_CYC_COUNTER(non_succ_put); +DECL_CYC_COUNTER(flush); +DECL_CYC_COUNTER(flush_obj); +#ifdef COMPARE_COPY_PAGE_SSE2 +EXTERN_CYC_COUNTER(pg_copy1); +EXTERN_CYC_COUNTER(pg_copy2); +EXTERN_CYC_COUNTER(pg_copy3); +EXTERN_CYC_COUNTER(pg_copy4); +#else +EXTERN_CYC_COUNTER(pg_copy); +#endif +DECL_CYC_COUNTER(compress); +DECL_CYC_COUNTER(decompress); + +/************ CORE DATA STRUCTURES ************************************/ + +#define MAX_POOLS_PER_DOMAIN 16 +#define MAX_GLOBAL_SHARED_POOLS 16 + +struct tm_pool; +struct client { + struct list_head client_list; + struct tm_pool *pools[MAX_POOLS_PER_DOMAIN]; + tmh_client_t *tmh; + struct list_head ephemeral_page_list; + long eph_count, eph_count_max; + cli_id_t cli_id; + uint32_t weight; + uint32_t cap; + bool_t compress; + bool_t frozen; + unsigned long compress_poor, compress_nomem; + unsigned long compressed_pages; + uint64_t compressed_sum_size; +}; +typedef struct client client_t; + +struct share_list { + struct list_head share_list; + client_t *client; +}; +typedef struct share_list sharelist_t; + +#define OBJ_HASH_BUCKETS 256 /* must be power of two */ +#define OBJ_HASH_BUCKETS_MASK (OBJ_HASH_BUCKETS-1) +#define OBJ_HASH(_oid) (tmh_hash(_oid, BITS_PER_LONG) & OBJ_HASH_BUCKETS_MASK) + +struct tm_pool { + bool_t shared; + bool_t persistent; + struct list_head pool_list; /* FIXME do we need this anymore? 
*/ + client_t *client; + uint64_t uuid[2]; /* 0 for private, non-zero for shared */ + uint32_t pool_id; + rwlock_t pool_rwlock; + struct rb_root obj_rb_root[OBJ_HASH_BUCKETS]; /* protected by pool_rwlock */ + struct list_head share_list; /* valid if shared */ + DECL_SENTINEL + int shared_count; /* valid if shared */ + atomic_t pgp_count; + int pgp_count_max; + long obj_count; /* atomicity depends on pool_rwlock held for write */ + long obj_count_max; + unsigned long objnode_count, objnode_count_max; + uint64_t sum_life_cycles; + uint64_t sum_evicted_cycles; + unsigned long puts, good_puts, no_mem_puts; + unsigned long dup_puts_flushed, dup_puts_replaced; + unsigned long gets, found_gets; + unsigned long flushs, flushs_found; + unsigned long flush_objs, flush_objs_found; +}; +typedef struct tm_pool pool_t; + +#define is_persistent(_p) (_p->persistent) +#define is_ephemeral(_p) (!(_p->persistent)) +#define is_shared(_p) (_p->shared) +#define is_private(_p) (!(_p->shared)) + +struct tmem_object_root { + DECL_SENTINEL + uint64_t oid; + struct rb_node rb_tree_node; /* protected by pool->pool_rwlock */ + unsigned long objnode_count; /* atomicity depends on obj_spinlock */ + long pgp_count; /* atomicity depends on obj_spinlock */ + struct radix_tree_root tree_root; /* tree of pages within object */ + pool_t *pool; + cli_id_t last_client; + spinlock_t obj_spinlock; + bool_t no_evict; /* if globally locked, pseudo-locks against eviction */ +}; +typedef struct tmem_object_root obj_t; + +typedef struct radix_tree_node rtn_t; +struct tmem_object_node { + obj_t *obj; + DECL_SENTINEL + rtn_t rtn; +}; +typedef struct tmem_object_node objnode_t; + +struct tmem_page_descriptor { + struct list_head global_eph_pages; + struct list_head client_eph_pages; + obj_t *obj; + uint32_t index; + size_t size; /* 0 == PAGE_SIZE (pfp), else compressed data (cdata) */ + union { + pfp_t *pfp; /* page frame pointer */ + char *cdata; /* compressed data */ + }; + uint64_t timestamp; + DECL_SENTINEL 
+}; +typedef struct tmem_page_descriptor pgp_t; + +static LIST_HEAD(global_ephemeral_page_list); /* all pages in ephemeral pools */ + +static LIST_HEAD(global_client_list); +static LIST_HEAD(global_pool_list); + +static pool_t *global_shared_pools[MAX_GLOBAL_SHARED_POOLS] = { 0 }; +static atomic_t client_weight_total = ATOMIC_INIT(0); +static int tmem_initialized = 0; + +/************ CONCURRENCY ***********************************************/ + +EXPORT DEFINE_SPINLOCK(tmem_spinlock); /* used iff tmh_lock_all */ +EXPORT DEFINE_RWLOCK(tmem_rwlock); /* used iff !tmh_lock_all */ +static DEFINE_SPINLOCK(eph_lists_spinlock); /* protects global AND clients */ + +#define tmem_spin_lock(_l) do {if (!tmh_lock_all) spin_lock(_l);}while(0) +#define tmem_spin_unlock(_l) do {if (!tmh_lock_all) spin_unlock(_l);}while(0) +#define tmem_read_lock(_l) do {if (!tmh_lock_all) read_lock(_l);}while(0) +#define tmem_read_unlock(_l) do {if (!tmh_lock_all) read_unlock(_l);}while(0) +#define tmem_write_lock(_l) do {if (!tmh_lock_all) write_lock(_l);}while(0) +#define tmem_write_unlock(_l) do {if (!tmh_lock_all) write_unlock(_l);}while(0) +#define tmem_write_trylock(_l) ((tmh_lock_all)?1:write_trylock(_l)) +#define tmem_spin_trylock(_l) (tmh_lock_all?1:spin_trylock(_l)) + +#define ASSERT_SPINLOCK(_l) ASSERT(tmh_lock_all || spin_is_locked(_l)) +#define ASSERT_WRITELOCK(_l) ASSERT(tmh_lock_all || rw_is_write_locked(_l)) + +/* global counters (should use long_atomic_t access) */ +static long global_eph_count = 0; /* atomicity depends on eph_lists_spinlock */ +static atomic_t global_obj_count = ATOMIC_INIT(0); +static atomic_t global_pgp_count = ATOMIC_INIT(0); +static atomic_t global_page_count = ATOMIC_INIT(0); +static atomic_t global_rtree_node_count = ATOMIC_INIT(0); + +#define atomic_inc_and_max(_c) do { \ + atomic_inc(&_c); \ + if ( _atomic_read(_c) > _c##_max ) \ + _c##_max = _atomic_read(_c); \ +} while (0) + +#define atomic_dec_and_assert(_c) do { \ + atomic_dec(&_c); \ + 
ASSERT(_atomic_read(_c) >= 0); \ +} while (0) + + +/************ MEMORY ALLOCATION INTERFACE *****************************/ + +#define tmem_malloc(_type,_pool) \ + _tmem_malloc(sizeof(_type), __alignof__(_type), _pool) + +#define tmem_malloc_bytes(_size,_pool) \ + _tmem_malloc(_size, 1, _pool) + +static NOINLINE void *_tmem_malloc(size_t size, size_t align, pool_t *pool) +{ + void *v; + + if ( (pool != NULL) && is_persistent(pool) ) + v = tmh_alloc_subpage_thispool(pool,size,align); + else + v = tmh_alloc_subpage(pool, size, align); + if ( v == NULL ) + alloc_failed++; + return v; +} + +static NOINLINE void tmem_free(void *p, size_t size, pool_t *pool) +{ + if ( pool == NULL || !is_persistent(pool) ) + tmh_free_subpage(p,size); + else + tmh_free_subpage_thispool(pool,p,size); +} + +static NOINLINE pfp_t *tmem_page_alloc(pool_t *pool) +{ + pfp_t *pfp = NULL; + + if ( pool != NULL && is_persistent(pool) ) + pfp = tmh_alloc_page_thispool(pool); + else + pfp = tmh_alloc_page(pool,0); + if ( pfp == NULL ) + alloc_page_failed++; + else + atomic_inc_and_max(global_page_count); + return pfp; +} + +static NOINLINE void tmem_page_free(pool_t *pool, pfp_t *pfp) +{ + ASSERT(pfp); + if ( pool == NULL || !is_persistent(pool) ) + tmh_free_page(pfp); + else + tmh_free_page_thispool(pool,pfp); + atomic_dec_and_assert(global_page_count); +} + +/************ PAGE DESCRIPTOR MANIPULATION ROUTINES *******************/ + +/* allocate a pgp_t and associate it with an object */ +static NOINLINE pgp_t *pgp_alloc(obj_t *obj) +{ + pgp_t *pgp; + pool_t *pool; + + ASSERT(obj != NULL); + ASSERT(obj->pool != NULL); + pool = obj->pool; + if ( (pgp = tmem_malloc(pgp_t, pool)) == NULL ) + return NULL; + pgp->obj = obj; + INIT_LIST_HEAD(&pgp->global_eph_pages); + INIT_LIST_HEAD(&pgp->client_eph_pages); + pgp->pfp = NULL; + pgp->size = -1; + pgp->index = -1; + pgp->timestamp = get_cycles(); + SET_SENTINEL(pgp,PGD); + atomic_inc_and_max(global_pgp_count); + atomic_inc_and_max(pool->pgp_count); + 
return pgp; +} + +static pgp_t *pgp_lookup_in_obj(obj_t *obj, uint32_t index) +{ + ASSERT(obj != NULL); + ASSERT_SPINLOCK(&obj->obj_spinlock); + ASSERT_SENTINEL(obj,OBJ); + ASSERT(obj->pool != NULL); + ASSERT_SENTINEL(obj->pool,POOL); + return radix_tree_lookup(&obj->tree_root, index); +} + +static NOINLINE void pgp_free_data(pgp_t *pgp, pool_t *pool) +{ + if ( pgp->pfp == NULL ) + return; + if ( !pgp->size ) + tmem_page_free(pgp->obj->pool,pgp->pfp); + else + { + tmem_free(pgp->cdata,pgp->size,pool); + if ( pool != NULL ) + { + pool->client->compressed_pages--; + pool->client->compressed_sum_size -= pgp->size; + } + } + pgp->pfp = NULL; + pgp->size = -1; +} + +static NOINLINE void pgp_free(pgp_t *pgp, int from_delete) +{ + pool_t *pool = NULL; + + ASSERT_SENTINEL(pgp,PGD); + ASSERT(pgp->obj != NULL); + ASSERT_SENTINEL(pgp->obj,OBJ); + ASSERT_SENTINEL(pgp->obj->pool,POOL); + ASSERT(list_empty(&pgp->global_eph_pages)); + ASSERT(list_empty(&pgp->client_eph_pages)); + if ( from_delete ) + ASSERT(pgp_lookup_in_obj(pgp->obj,pgp->index) == NULL); + ASSERT(pgp->obj->pool != NULL); + pool = pgp->obj->pool; + pgp_free_data(pgp, pool); + INVERT_SENTINEL(pgp,PGD); + pgp->obj = NULL; + pgp->index = -1; + pgp->size = -1; + atomic_dec_and_assert(global_pgp_count); + atomic_dec_and_assert(pool->pgp_count); + tmem_free(pgp,sizeof(pgp_t),pool); +} + +/* remove the page from appropriate lists but not from parent object */ +static void pgp_delist(pgp_t *pgp, bool_t no_eph_lock) +{ + ASSERT(pgp != NULL); + ASSERT(pgp->obj != NULL); + ASSERT(pgp->obj->pool != NULL); + ASSERT(pgp->obj->pool->client != NULL); + if ( is_ephemeral(pgp->obj->pool) ) + { + if ( !no_eph_lock ) + tmem_spin_lock(&eph_lists_spinlock); + if ( !list_empty(&pgp->client_eph_pages) ) + pgp->obj->pool->client->eph_count--; + ASSERT(pgp->obj->pool->client->eph_count >= 0); + list_del_init(&pgp->client_eph_pages); + if ( !list_empty(&pgp->global_eph_pages) ) + global_eph_count--; + ASSERT(global_eph_count >= 0); + 
list_del_init(&pgp->global_eph_pages); + if ( !no_eph_lock ) + tmem_spin_unlock(&eph_lists_spinlock); + } +} + +/* remove page from lists (but not from parent object) and free it */ +static NOINLINE void pgp_delete(pgp_t *pgp, bool_t no_eph_lock) +{ + uint64_t life; + + ASSERT(pgp != NULL); + ASSERT(pgp->obj != NULL); + ASSERT(pgp->obj->pool != NULL); + life = get_cycles() - pgp->timestamp; + pgp->obj->pool->sum_life_cycles += life; + pgp_delist(pgp, no_eph_lock); + pgp_free(pgp,1); +} + +/* called only indirectly by radix_tree_destroy */ +static NOINLINE void pgp_destroy(void *v) +{ + pgp_t *pgp = (pgp_t *)v; + + ASSERT_SPINLOCK(&pgp->obj->obj_spinlock); + pgp_delist(pgp,0); + ASSERT(pgp->obj != NULL); + pgp->obj->pgp_count--; + ASSERT(pgp->obj->pgp_count >= 0); + pgp_free(pgp,0); +} + +FORWARD static rtn_t *rtn_alloc(void *arg); +FORWARD static void rtn_free(rtn_t *rtn); + +static int pgp_add_to_obj(obj_t *obj, uint32_t index, pgp_t *pgp) +{ + int ret; + + ASSERT_SPINLOCK(&obj->obj_spinlock); + ret = radix_tree_insert(&obj->tree_root, index, pgp, rtn_alloc, obj); + if ( !ret ) + obj->pgp_count++; + return ret; +} + +static NOINLINE pgp_t *pgp_delete_from_obj(obj_t *obj, uint32_t index) +{ + pgp_t *pgp; + + ASSERT(obj != NULL); + ASSERT_SPINLOCK(&obj->obj_spinlock); + ASSERT_SENTINEL(obj,OBJ); + ASSERT(obj->pool != NULL); + ASSERT_SENTINEL(obj->pool,POOL); + pgp = radix_tree_delete(&obj->tree_root, index, rtn_free); + if ( pgp != NULL ) + obj->pgp_count--; + ASSERT(obj->pgp_count >= 0); + + return pgp; +} + +/************ RADIX TREE NODE MANIPULATION ROUTINES *******************/ + +/* called only indirectly from radix_tree_insert */ +static NOINLINE rtn_t *rtn_alloc(void *arg) +{ + objnode_t *objnode; + obj_t *obj = (obj_t *)arg; + + ASSERT_SENTINEL(obj,OBJ); + ASSERT(obj->pool != NULL); + ASSERT_SENTINEL(obj->pool,POOL); + objnode = tmem_malloc(objnode_t,obj->pool); + if (objnode == NULL) + return NULL; + objnode->obj = obj; + SET_SENTINEL(objnode,OBJNODE); + 
memset(&objnode->rtn, 0, sizeof(rtn_t)); + if (++obj->pool->objnode_count > obj->pool->objnode_count_max) + obj->pool->objnode_count_max = obj->pool->objnode_count; + atomic_inc_and_max(global_rtree_node_count); + obj->objnode_count++; + return &objnode->rtn; +} + +/* called only indirectly from radix_tree_delete/destroy */ +static void rtn_free(rtn_t *rtn) +{ + pool_t *pool; + objnode_t *objnode; + int i; + + ASSERT(rtn != NULL); + for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) + ASSERT(rtn->slots[i] == NULL); + objnode = container_of(rtn,objnode_t,rtn); + ASSERT_SENTINEL(objnode,OBJNODE); + INVERT_SENTINEL(objnode,OBJNODE); + ASSERT(objnode->obj != NULL); + ASSERT_SPINLOCK(&objnode->obj->obj_spinlock); + ASSERT_SENTINEL(objnode->obj,OBJ); + pool = objnode->obj->pool; + ASSERT(pool != NULL); + ASSERT_SENTINEL(pool,POOL); + pool->objnode_count--; + objnode->obj->objnode_count--; + objnode->obj = NULL; + tmem_free(objnode,sizeof(objnode_t),pool); + atomic_dec_and_assert(global_rtree_node_count); +} + +/************ POOL OBJECT COLLECTION MANIPULATION ROUTINES *******************/ + +/* searches for object==oid in pool, returns locked object if found */ +static NOINLINE obj_t * obj_find(pool_t *pool, uint64_t oid) +{ + struct rb_node *node; + obj_t *obj; + +restart_find: + tmem_read_lock(&pool->pool_rwlock); + node = pool->obj_rb_root[OBJ_HASH(oid)].rb_node; + while ( node ) + { + obj = container_of(node, obj_t, rb_tree_node); + if ( obj->oid == oid ) + { + if ( tmh_lock_all ) + obj->no_evict = 1; + else + { + if ( !tmem_spin_trylock(&obj->obj_spinlock) ) + { + tmem_read_unlock(&pool->pool_rwlock); + goto restart_find; + } + tmem_read_unlock(&pool->pool_rwlock); + } + return obj; + } + else if ( oid < obj->oid ) + node = node->rb_left; + else + node = node->rb_right; + } + tmem_read_unlock(&pool->pool_rwlock); + return NULL; +} + +/* free an object that has no more pgps in it */ +static NOINLINE void obj_free(obj_t *obj, int no_rebalance) +{ + pool_t *pool; + uint64_t 
old_oid; + + ASSERT_SPINLOCK(&obj->obj_spinlock); + ASSERT(obj != NULL); + ASSERT_SENTINEL(obj,OBJ); + ASSERT(obj->pgp_count == 0); + pool = obj->pool; + ASSERT(pool != NULL); + ASSERT_WRITELOCK(&pool->pool_rwlock); + if ( obj->tree_root.rnode != NULL ) /* may be a "stump" with no leaves */ + radix_tree_destroy(&obj->tree_root, pgp_destroy, rtn_free); + ASSERT((long)obj->objnode_count == 0); + ASSERT(obj->tree_root.rnode == NULL); + pool->obj_count--; + ASSERT(pool->obj_count >= 0); + INVERT_SENTINEL(obj,OBJ); + obj->pool = NULL; + old_oid = obj->oid; + obj->oid = -1; + obj->last_client = CLI_ID_NULL; + atomic_dec_and_assert(global_obj_count); + /* use no_rebalance only if all objects are being destroyed anyway */ + if ( !no_rebalance ) + rb_erase(&obj->rb_tree_node,&pool->obj_rb_root[OBJ_HASH(old_oid)]); + tmem_free(obj,sizeof(obj_t),pool); +} + +static NOINLINE void obj_rb_destroy_node(struct rb_node *node) +{ + obj_t * obj; + + if ( node == NULL ) + return; + obj_rb_destroy_node(node->rb_left); + obj_rb_destroy_node(node->rb_right); + obj = container_of(node, obj_t, rb_tree_node); + tmem_spin_lock(&obj->obj_spinlock); + ASSERT(obj->no_evict == 0); + radix_tree_destroy(&obj->tree_root, pgp_destroy, rtn_free); + obj_free(obj,1); +} + +static NOINLINE int obj_rb_insert(struct rb_root *root, obj_t *obj) +{ + struct rb_node **new, *parent = NULL; + obj_t *this; + + new = &(root->rb_node); + while ( *new ) + { + this = container_of(*new, obj_t, rb_tree_node); + parent = *new; + if ( obj->oid < this->oid ) + new = &((*new)->rb_left); + else if ( obj->oid > this->oid ) + new = &((*new)->rb_right); + else + return 0; + } + rb_link_node(&obj->rb_tree_node, parent, new); + rb_insert_color(&obj->rb_tree_node, root); + return 1; +} + +/* + * allocate, initialize, and insert an tmem_object_root + * (should be called only if find failed) + */ +static NOINLINE obj_t * obj_new(pool_t *pool, uint64_t oid) +{ + obj_t *obj; + + ASSERT(pool != NULL); + 
ASSERT_WRITELOCK(&pool->pool_rwlock); + if ( (obj = tmem_malloc(obj_t,pool)) == NULL ) + return NULL; + pool->obj_count++; + if (pool->obj_count > pool->obj_count_max) + pool->obj_count_max = pool->obj_count; + atomic_inc_and_max(global_obj_count); + INIT_RADIX_TREE(&obj->tree_root,0); + spin_lock_init(&obj->obj_spinlock); + obj->pool = pool; + obj->oid = oid; + obj->objnode_count = 0; + obj->pgp_count = 0; + obj->last_client = CLI_ID_NULL; + SET_SENTINEL(obj,OBJ); + tmem_spin_lock(&obj->obj_spinlock); + obj_rb_insert(&pool->obj_rb_root[OBJ_HASH(oid)], obj); + obj->no_evict = 1; + ASSERT_SPINLOCK(&obj->obj_spinlock); + return obj; +} + +/* free an object after destroying any pgps in it */ +static NOINLINE void obj_destroy(obj_t *obj) +{ + ASSERT_WRITELOCK(&obj->pool->pool_rwlock); + radix_tree_destroy(&obj->tree_root, pgp_destroy, rtn_free); + obj_free(obj,0); +} + +/* destroy all objects in a pool */ +static NOINLINE void obj_rb_destroy_all(pool_t *pool) +{ + int i; + + tmem_write_lock(&pool->pool_rwlock); + for (i = 0; i < OBJ_HASH_BUCKETS; i++) + obj_rb_destroy_node(pool->obj_rb_root[i].rb_node); + tmem_write_unlock(&pool->pool_rwlock); +} + +/* destroys all objects in a pool that have last_client set to cli_id */ +static void obj_free_selective(pool_t *pool, cli_id_t cli_id) +{ + struct rb_node *node; + obj_t *obj; + int i; + + tmem_write_lock(&pool->pool_rwlock); + for (i = 0; i < OBJ_HASH_BUCKETS; i++) + { + node = rb_first(&pool->obj_rb_root[i]); + while ( node != NULL ) + { + obj = container_of(node, obj_t, rb_tree_node); + tmem_spin_lock(&obj->obj_spinlock); + node = rb_next(node); + if ( obj->last_client == cli_id ) + obj_destroy(obj); + else + tmem_spin_unlock(&obj->obj_spinlock); + } + } + tmem_write_unlock(&pool->pool_rwlock); +} + + +/************ POOL MANIPULATION ROUTINES ******************************/ + +static pool_t * pool_alloc(void) +{ + pool_t *pool; + int i; + + if ( (pool = tmem_malloc(pool_t,NULL)) == NULL ) + return NULL; + for (i = 0; i 
< OBJ_HASH_BUCKETS; i++) + pool->obj_rb_root[i] = RB_ROOT; + INIT_LIST_HEAD(&pool->pool_list); + rwlock_init(&pool->pool_rwlock); + pool->pgp_count_max = pool->obj_count_max = 0; + pool->objnode_count = pool->objnode_count_max = 0; + atomic_set(&pool->pgp_count,0); + pool->obj_count = 0; + pool->good_puts = pool->puts = pool->dup_puts_flushed = 0; + pool->dup_puts_replaced = pool->no_mem_puts = 0; + pool->found_gets = pool->gets = 0; + pool->flushs_found = pool->flushs = 0; + pool->flush_objs_found = pool->flush_objs = 0; + SET_SENTINEL(pool,POOL); + return pool; +} + +static NOINLINE void pool_free(pool_t *pool) +{ + ASSERT_SENTINEL(pool,POOL); + INVERT_SENTINEL(pool,POOL); + pool->client = NULL; + list_del(&pool->pool_list); + tmem_free(pool,sizeof(pool_t),NULL); +} + +/* register new_client as a user of this shared pool and return new + total number of registered users */ +static int shared_pool_join(pool_t *pool, client_t *new_client) +{ + sharelist_t *sl; + + ASSERT(is_shared(pool)); + if ( (sl = tmem_malloc(sharelist_t,NULL)) == NULL ) + return -1; + sl->client = new_client; + list_add_tail(&sl->share_list, &pool->share_list); + printk("adding new %s %d to shared pool owned by %s %d\n", + client_str, new_client->cli_id, client_str, pool->client->cli_id); + return ++pool->shared_count; +} + +/* reassign "ownership" of the pool to another client that shares this pool */ +static NOINLINE void shared_pool_reassign(pool_t *pool) +{ + sharelist_t *sl; + int poolid; + client_t *old_client = pool->client, *new_client; + + ASSERT(is_shared(pool)); + if ( list_empty(&pool->share_list) ) + { + ASSERT(pool->shared_count == 0); + return; + } + old_client->pools[pool->pool_id] = NULL; + sl = list_entry(pool->share_list.next, sharelist_t, share_list); + ASSERT(sl->client != old_client); + pool->client = new_client = sl->client; + for (poolid = 0; poolid < MAX_POOLS_PER_DOMAIN; poolid++) + if (new_client->pools[poolid] == pool) + break; + ASSERT(poolid != 
MAX_POOLS_PER_DOMAIN); + printk("reassigned shared pool from %s=%d to %s=%d pool_id=%d\n", + cli_id_str, old_client->cli_id, cli_id_str, new_client->cli_id, poolid); + pool->pool_id = poolid; +} + +/* destroy all objects with last_client same as passed cli_id, + remove pool's cli_id from list of sharers of this pool */ +static NOINLINE int shared_pool_quit(pool_t *pool, cli_id_t cli_id) +{ + sharelist_t *sl; + int s_poolid; + + ASSERT(is_shared(pool)); + ASSERT(pool->client != NULL); + + obj_free_selective(pool,cli_id); + list_for_each_entry(sl,&pool->share_list, share_list) + { + if (sl->client->cli_id != cli_id) + continue; + list_del(&sl->share_list); + tmem_free(sl,sizeof(sharelist_t),pool); + --pool->shared_count; + if (pool->client->cli_id == cli_id) + shared_pool_reassign(pool); + if (pool->shared_count) + return pool->shared_count; + for (s_poolid = 0; s_poolid < MAX_GLOBAL_SHARED_POOLS; s_poolid++) + if ( (global_shared_pools[s_poolid]) == pool ) + { + global_shared_pools[s_poolid] = NULL; + break; + } + return 0; + } + printk("tmem: no match unsharing pool, %s=%d\n", + cli_id_str,pool->client->cli_id); + return -1; +} + +/* flush all data (owned by cli_id) from a pool and, optionally, free it */ +static void pool_flush(pool_t *pool, cli_id_t cli_id, bool_t destroy) +{ + ASSERT(pool != NULL); + if ( (is_shared(pool)) && (shared_pool_quit(pool,cli_id) > 0) ) + { + printk("tmem: unshared shared pool %d from %s=%d\n", + pool->pool_id, cli_id_str,pool->client->cli_id); + return; + } + printk("%s %s-%s tmem pool ",destroy?"destroying":"flushing", + is_persistent(pool) ? "persistent" : "ephemeral" , + is_shared(pool) ? 
"shared" : "private"); + printk("%s=%d pool_id=%d\n", cli_id_str,pool->client->cli_id,pool->pool_id); + obj_rb_destroy_all(pool); + if ( destroy ) + { + pool->client->pools[pool->pool_id] = NULL; + pool_free(pool); + } +} + +/************ CLIENT MANIPULATION OPERATIONS **************************/ + +static client_t *client_create(void) +{ + client_t *client = tmem_malloc(client_t,NULL); + cli_id_t cli_id = tmh_get_cli_id_from_current(); + + printk("tmem: initializing tmem capability for %s=%d...",cli_id_str,cli_id); + if ( client == NULL ) + { + printk("failed... out of memory\n"); + return NULL; + } + memset(client,0,sizeof(client_t)); + if ( (client->tmh = tmh_client_init()) == NULL ) + { + printk("failed... can't allocate host-dependent part of client\n"); + if ( client ) + tmem_free(client,sizeof(client_t),NULL); + return NULL; + } + tmh_set_current_client(client); + client->cli_id = cli_id; +#ifdef __i386__ + client->compress = 0; +#else + client->compress = tmh_compression_enabled(); +#endif + list_add_tail(&client->client_list, &global_client_list); + INIT_LIST_HEAD(&client->ephemeral_page_list); + client->eph_count = client->eph_count_max = 0; + printk("ok\n"); + return client; +} + +static void client_free(client_t *client) +{ + list_del(&client->client_list); + tmh_client_destroy(client->tmh); + tmh_set_current_client(NULL); + tmem_free(client,sizeof(client_t),NULL); +} + +/* flush all data from a client and, optionally, free it */ +static void client_flush(client_t *client, bool_t destroy) +{ + int i; + pool_t *pool; + + for (i = 0; i < MAX_POOLS_PER_DOMAIN; i++) + { + if ( (pool = client->pools[i]) == NULL ) + continue; + pool_flush(pool,client->cli_id,destroy); + if ( destroy ) + client->pools[i] = NULL; + } + if ( destroy ) + client_free(client); +} + +static bool_t client_over_quota(client_t *client) +{ + int total = _atomic_read(client_weight_total); + + ASSERT(client != NULL); + if ( (total == 0) || (client->weight == 0) || + (client->eph_count == 
0) ) + return 0; + return ( ((global_eph_count*100L) / client->eph_count ) > + ((total*100L) / client->weight) ); +} + +/************ MEMORY REVOCATION ROUTINES *******************************/ + +static int tmem_evict(void) +{ + client_t *client = tmh_client_from_current(); + pgp_t *pgp = NULL, *pgp_del; + obj_t *obj; + pool_t *pool; + int ret = 0; + bool_t hold_pool_rwlock = 0; + + evict_attempts++; + tmem_spin_lock(&eph_lists_spinlock); + if ( (client != NULL) && client_over_quota(client) && + !list_empty(&client->ephemeral_page_list) ) + { + list_for_each_entry(pgp,&client->ephemeral_page_list,client_eph_pages) + { + obj = pgp->obj; + pool = obj->pool; + if ( tmh_lock_all && !obj->no_evict ) + goto found; + if ( tmem_spin_trylock(&obj->obj_spinlock) ) + { + if ( obj->pgp_count > 1 ) + goto found; + if ( tmem_write_trylock(&pool->pool_rwlock) ) + { + hold_pool_rwlock = 1; + goto found; + } + tmem_spin_unlock(&obj->obj_spinlock); + } + } + } else if ( list_empty(&global_ephemeral_page_list) ) { + goto out; + } else { + list_for_each_entry(pgp,&global_ephemeral_page_list,global_eph_pages) + { + obj = pgp->obj; + pool = obj->pool; + if ( tmh_lock_all && !obj->no_evict ) + goto found; + if ( tmem_spin_trylock(&obj->obj_spinlock) ) + { + if ( obj->pgp_count > 1 ) + goto found; + if ( tmem_write_trylock(&pool->pool_rwlock) ) + { + hold_pool_rwlock = 1; + goto found; + } + tmem_spin_unlock(&obj->obj_spinlock); + } + } + } + + ret = 0; + goto out; + +found: + ASSERT(pgp != NULL); + ASSERT_SENTINEL(pgp,PGD); + obj = pgp->obj; + ASSERT(obj != NULL); + ASSERT(obj->no_evict == 0); + ASSERT(obj->pool != NULL); + ASSERT_SENTINEL(obj,OBJ); + + ASSERT_SPINLOCK(&obj->obj_spinlock); + pgp_del = pgp_delete_from_obj(obj, pgp->index); + ASSERT(pgp_del == pgp); + pgp_delete(pgp,1); + if ( obj->pgp_count == 0 ) + { + ASSERT_WRITELOCK(&pool->pool_rwlock); + obj_free(obj,0); + } + else + tmem_spin_unlock(&obj->obj_spinlock); + if ( hold_pool_rwlock ) + 
tmem_write_unlock(&pool->pool_rwlock); + evicted_pgs++; + ret = 1; + +out: + tmem_spin_unlock(&eph_lists_spinlock); + return ret; +} + +static unsigned long tmem_relinquish_npages(unsigned long n) +{ + unsigned long avail_pages = 0; + + while ( (avail_pages = tmh_avail_pages()) < n ) + { + if ( !tmem_evict() ) + break; + } + if ( avail_pages ) + tmh_release_avail_pages_to_host(); + return avail_pages; +} + +/************ TMEM CORE OPERATIONS ************************************/ + +static NOINLINE int do_tmem_put_compress(pgp_t *pgp, tmem_cli_mfn_t cmfn) +{ + void *dst, *p; + size_t size; + int ret = 0; + DECL_LOCAL_CYC_COUNTER(compress); + + ASSERT(pgp != NULL); + ASSERT(pgp->obj != NULL); + ASSERT_SPINLOCK(&pgp->obj->obj_spinlock); + ASSERT(pgp->obj->pool != NULL); + ASSERT(pgp->obj->pool->client != NULL); +#ifdef __i386__ + return -ENOMEM; +#endif + if ( pgp->pfp != NULL ) + pgp_free_data(pgp, pgp->obj->pool); /* FIXME... is this right? */ + START_CYC_COUNTER(compress); + ret = tmh_compress_from_client(cmfn, &dst, &size); + if ( (ret == -EFAULT) || (ret == 0) ) + goto out; + else if ( (size == 0) || (size >= tmem_subpage_maxsize()) ) + ret = 0; + else if ( (p = tmem_malloc_bytes(size,pgp->obj->pool)) == NULL ) + ret = -ENOMEM; + else + { + memcpy(p,dst,size); + pgp->cdata = p; + pgp->size = size; + pgp->obj->pool->client->compressed_pages++; + pgp->obj->pool->client->compressed_sum_size += size; + ret = 1; + } + +out: + END_CYC_COUNTER(compress); + return ret; +} + +static NOINLINE int do_tmem_dup_put(pgp_t *pgp, tmem_cli_mfn_t cmfn, + uint32_t tmem_offset, uint32_t pfn_offset, uint32_t len) +{ + pool_t *pool; + obj_t *obj; + client_t *client; + pgp_t *pgpfound = NULL; + int ret; + + /* if we can successfully manipulate pgp to change out the data, do so */ + ASSERT(pgp != NULL); + ASSERT(pgp->pfp != NULL); + ASSERT(pgp->size != -1); + obj = pgp->obj; + ASSERT_SPINLOCK(&obj->obj_spinlock); + ASSERT(obj != NULL); + pool = obj->pool; + ASSERT(pool != NULL); + 
client = pool->client; + if ( len != 0 && tmh_compression_enabled() && + client->compress && pgp->size != 0 ) + { + ret = do_tmem_put_compress(pgp,cmfn); + if ( ret == 1 ) + goto done; + else if ( ret == 0 ) + goto copy_uncompressed; + else if ( ret == -ENOMEM ) + goto failed_dup; + else if ( ret == -EFAULT ) + goto bad_copy; + } + +copy_uncompressed: + if ( pgp->pfp ) + pgp_free_data(pgp, pool); + if ( ( pgp->pfp = tmem_page_alloc(pool) ) == NULL ) + goto failed_dup; + /* tmh_copy_from_client properly handles len==0 and offsets != 0 */ + ret = tmh_copy_from_client(pgp->pfp,cmfn,tmem_offset,pfn_offset,len); + if ( ret == -EFAULT ) + goto bad_copy; + pgp->size = 0; + +done: + /* successfully replaced data, clean up and return success */ + if ( is_shared(pool) ) + obj->last_client = client->cli_id; + obj->no_evict = 0; + tmem_spin_unlock(&obj->obj_spinlock); + pool->dup_puts_replaced++; + pool->good_puts++; + return 1; + +bad_copy: + /* this should only happen if the client passed a bad mfn */ + failed_copies++; +ASSERT(0); + return -EFAULT; + +failed_dup: + /* couldn't change out the data, flush the old data and return + * -ENOSPC instead of -ENOMEM to differentiate failed _dup_ put */ + pgpfound = pgp_delete_from_obj(obj, pgp->index); + ASSERT(pgpfound == pgp); + pgp_delete(pgpfound,0); + if ( obj->pgp_count == 0 ) + { + tmem_write_lock(&pool->pool_rwlock); + obj_free(obj,0); + tmem_write_unlock(&pool->pool_rwlock); + } else { + obj->no_evict = 0; + tmem_spin_unlock(&obj->obj_spinlock); + } + pool->dup_puts_flushed++; + return -ENOSPC; +} + + +static NOINLINE int do_tmem_put(pool_t *pool, uint64_t oid, uint32_t index, + tmem_cli_mfn_t cmfn, uint32_t tmem_offset, + uint32_t pfn_offset, uint32_t len) +{ + obj_t *obj = NULL, *objfound = NULL, *objnew = NULL; + pgp_t *pgp = NULL, *pgpdel = NULL; + client_t *client = pool->client; + int ret = client->frozen ? -EFROZEN : -ENOMEM; + + ASSERT(pool != NULL); + pool->puts++; + /* does page already exist (dup)? 
if so, handle specially */
+    if ( (obj = objfound = obj_find(pool,oid)) != NULL )
+    {
+        ASSERT_SPINLOCK(&objfound->obj_spinlock);
+        if ((pgp = pgp_lookup_in_obj(objfound, index)) != NULL)
+            return do_tmem_dup_put(pgp,cmfn,tmem_offset,pfn_offset,len);
+    }
+
+    /* no puts allowed into a frozen pool (except dup puts) */
+    if ( client->frozen )
+        goto free;
+
+    if ( (objfound == NULL) )
+    {
+        tmem_write_lock(&pool->pool_rwlock);
+        if ( (obj = objnew = obj_new(pool,oid)) == NULL )
+        {
+            tmem_write_unlock(&pool->pool_rwlock);
+            return -ENOMEM;
+        }
+        ASSERT_SPINLOCK(&objnew->obj_spinlock);
+        tmem_write_unlock(&pool->pool_rwlock);
+    }
+
+    ASSERT((obj != NULL)&&((objnew==obj)||(objfound==obj))&&(objnew!=objfound));
+    ASSERT_SPINLOCK(&obj->obj_spinlock);
+    if ( (pgp = pgp_alloc(obj)) == NULL )
+        goto free;
+
+    ret = pgp_add_to_obj(obj, index, pgp);
+    if ( ret == -ENOMEM )
+        /* warning, may result in partially built radix tree ("stump") */
+        goto free;
+    ASSERT(ret != -EEXIST);
+    pgp->index = index;
+
+    if ( len != 0 && tmh_compression_enabled() && client->compress )
+    {
+        ASSERT(pgp->pfp == NULL);
+        ret = do_tmem_put_compress(pgp,cmfn);
+        if ( ret == 1 )
+            goto insert_page;
+        if ( ret == -ENOMEM )
+        {
+            client->compress_nomem++;
+            goto delete_and_free;
+        }
+        if ( ret == 0 )
+        {
+            client->compress_poor++;
+            goto copy_uncompressed;
+        }
+        if ( ret == -EFAULT )
+            goto bad_copy;
+    }
+
+copy_uncompressed:
+    if ( ( pgp->pfp = tmem_page_alloc(pool) ) == NULL )
+    {
+        ret = -ENOMEM;
+        goto delete_and_free;
+    }
+    /* tmh_copy_from_client properly handles len==0 (TMEM_NEW_PAGE) */
+    ret = tmh_copy_from_client(pgp->pfp,cmfn,tmem_offset,pfn_offset,len);
+    if ( ret == -EFAULT )
+        goto bad_copy;
+    pgp->size = 0;
+
+insert_page:
+    if ( is_ephemeral(pool) )
+    {
+        tmem_spin_lock(&eph_lists_spinlock);
+        list_add_tail(&pgp->global_eph_pages,
+            &global_ephemeral_page_list);
+        if (++global_eph_count > global_eph_count_max)
+            global_eph_count_max = global_eph_count;
+        
list_add_tail(&pgp->client_eph_pages, + &client->ephemeral_page_list); + if (++client->eph_count > client->eph_count_max) + client->eph_count_max = client->eph_count; + tmem_spin_unlock(&eph_lists_spinlock); + } + ASSERT( ((objnew==obj)||(objfound==obj)) && (objnew!=objfound)); + if ( is_shared(pool) ) + obj->last_client = client->cli_id; + obj->no_evict = 0; + tmem_spin_unlock(&obj->obj_spinlock); + pool->good_puts++; + return 1; + +delete_and_free: + ASSERT((obj != NULL) && (pgp != NULL) && (pgp->index != -1)); + pgpdel = pgp_delete_from_obj(obj, pgp->index); + ASSERT(pgp == pgpdel); + +free: + if ( pgp ) + pgp_delete(pgp,0); + if ( objfound ) + { + objfound->no_evict = 0; + tmem_spin_unlock(&objfound->obj_spinlock); + } + if ( objnew ) + { + tmem_write_lock(&pool->pool_rwlock); + obj_free(objnew,0); + tmem_write_unlock(&pool->pool_rwlock); + } + pool->no_mem_puts++; + return ret; + +bad_copy: + /* this should only happen if the client passed a bad mfn */ + failed_copies++; +ASSERT(0); + goto free; +} + +static NOINLINE int do_tmem_get(pool_t *pool, uint64_t oid, uint32_t index, + tmem_cli_mfn_t cmfn, uint32_t tmem_offset, + uint32_t pfn_offset, uint32_t len) +{ + obj_t *obj; + pgp_t *pgp; + client_t *client = pool->client; + DECL_LOCAL_CYC_COUNTER(decompress); + + if ( !_atomic_read(pool->pgp_count) ) + return -EEMPTY; + + pool->gets++; + obj = obj_find(pool,oid); + if ( obj == NULL ) + return 0; + + ASSERT_SPINLOCK(&obj->obj_spinlock); + if (is_shared(pool) || is_persistent(pool) ) + pgp = pgp_lookup_in_obj(obj, index); + else + pgp = pgp_delete_from_obj(obj, index); + if ( pgp == NULL ) + { + obj->no_evict = 0; + tmem_spin_unlock(&obj->obj_spinlock); + return 0; + } + ASSERT(pgp->size != -1); + if ( pgp->size != 0 ) + { + START_CYC_COUNTER(decompress); + if ( tmh_decompress_to_client(cmfn, pgp->cdata, pgp->size) == -EFAULT ) + goto bad_copy; + END_CYC_COUNTER(decompress); + } + else if ( tmh_copy_to_client(cmfn, pgp->pfp, tmem_offset, + pfn_offset, len) == 
-EFAULT) + goto bad_copy; + if ( is_ephemeral(pool) ) + { + if ( is_private(pool) ) + { + pgp_delete(pgp,0); + if ( obj->pgp_count == 0 ) + { + tmem_write_lock(&pool->pool_rwlock); + obj_free(obj,0); + obj = NULL; + tmem_write_unlock(&pool->pool_rwlock); + } + } else { + tmem_spin_lock(&eph_lists_spinlock); + list_del(&pgp->global_eph_pages); + list_add_tail(&pgp->global_eph_pages,&global_ephemeral_page_list); + list_del(&pgp->client_eph_pages); + list_add_tail(&pgp->client_eph_pages,&client->ephemeral_page_list); + tmem_spin_unlock(&eph_lists_spinlock); + ASSERT(obj != NULL); + obj->last_client = tmh_get_cli_id_from_current(); + } + } + if ( obj != NULL ) + { + obj->no_evict = 0; + tmem_spin_unlock(&obj->obj_spinlock); + } + pool->found_gets++; + return 1; + +bad_copy: + /* this should only happen if the client passed a bad mfn */ + failed_copies++; +ASSERT(0); + return -EFAULT; + +} + +static NOINLINE int do_tmem_flush_page(pool_t *pool, uint64_t oid, uint32_t index) +{ + obj_t *obj; + pgp_t *pgp; + + pool->flushs++; + obj = obj_find(pool,oid); + if ( obj == NULL ) + goto out; + pgp = pgp_delete_from_obj(obj, index); + if ( pgp == NULL ) + { + obj->no_evict = 0; + tmem_spin_unlock(&obj->obj_spinlock); + goto out; + } + pgp_delete(pgp,0); + if ( obj->pgp_count == 0 ) + { + tmem_write_lock(&pool->pool_rwlock); + obj_free(obj,0); + tmem_write_unlock(&pool->pool_rwlock); + } else { + obj->no_evict = 0; + tmem_spin_unlock(&obj->obj_spinlock); + } + pool->flushs_found++; + +out: + if ( pool->client->frozen ) + return -EFROZEN; + else + return 1; +} + +static NOINLINE int do_tmem_flush_object(pool_t *pool, uint64_t oid) +{ + obj_t *obj; + + pool->flush_objs++; + obj = obj_find(pool,oid); + if ( obj == NULL ) + goto out; + tmem_write_lock(&pool->pool_rwlock); + obj_destroy(obj); + pool->flush_objs_found++; + tmem_write_unlock(&pool->pool_rwlock); + +out: + if ( pool->client->frozen ) + return -EFROZEN; + else + return 1; +} + +static NOINLINE int 
do_tmem_destroy_pool(uint32_t pool_id) +{ + client_t *client = tmh_client_from_current(); + pool_t *pool; + + if ( client->pools == NULL ) + return 0; + if ( (pool = client->pools[pool_id]) == NULL ) + return 0; + client->pools[pool_id] = NULL; + pool_flush(pool,client->cli_id,1); + return 1; +} + +static NOINLINE int do_tmem_new_pool(uint32_t flags, uint64_t uuid_lo, uint64_t uuid_hi) +{ + client_t *client = tmh_client_from_current(); + cli_id_t cli_id = tmh_get_cli_id_from_current(); + int persistent = flags & TMEM_POOL_PERSIST; + int shared = flags & TMEM_POOL_SHARED; + int pagebits = (flags >> TMEM_POOL_PAGESIZE_SHIFT) + & TMEM_POOL_PAGESIZE_MASK; + int specversion = (flags >> TMEM_POOL_VERSION_SHIFT) + & TMEM_POOL_VERSION_MASK; + pool_t *pool, *shpool; + int s_poolid, d_poolid, first_unused_s_poolid; + + ASSERT(client != NULL); + printk("tmem: allocating %s-%s tmem pool for %s=%d...", + persistent ? "persistent" : "ephemeral" , + shared ? "shared" : "private", cli_id_str, cli_id); + if ( specversion != 0 ) + { + printk("failed... unsupported spec version\n"); + return -EPERM; + } + if ( pagebits != (PAGE_SHIFT - 12) ) + { + printk("failed... unsupported pagesize %d\n",1<<(pagebits+12)); + return -EPERM; + } + if ( (pool = pool_alloc()) == NULL ) + { + printk("failed... out of memory\n"); + return -ENOMEM; + } + for ( d_poolid = 0; d_poolid < MAX_POOLS_PER_DOMAIN; d_poolid++ ) + if ( client->pools[d_poolid] == NULL ) + break; + if ( d_poolid == MAX_POOLS_PER_DOMAIN ) + { + printk("failed... 
no more pool slots available for this %s\n", + client_str); + goto fail; + } + pool->shared = shared; + pool->client = client; + if ( shared ) + { + first_unused_s_poolid = MAX_GLOBAL_SHARED_POOLS; + for ( s_poolid = 0; s_poolid < MAX_GLOBAL_SHARED_POOLS; s_poolid++ ) + { + if ( (shpool = global_shared_pools[s_poolid]) != NULL ) + { + if ( shpool->uuid[0] == uuid_lo && shpool->uuid[1] == uuid_hi ) + { + printk("(matches shared pool uuid=%"PRIx64".%"PRIu64") ", + uuid_hi, uuid_lo); + printk("pool_id=%d\n",d_poolid); + client->pools[d_poolid] = global_shared_pools[s_poolid]; + shared_pool_join(global_shared_pools[s_poolid], client); + pool_free(pool); + return d_poolid; + } + } + else if ( first_unused_s_poolid == MAX_GLOBAL_SHARED_POOLS ) + first_unused_s_poolid = s_poolid; + } + if ( first_unused_s_poolid == MAX_GLOBAL_SHARED_POOLS ) + { + printk("tmem: failed... no global shared pool slots available\n"); + goto fail; + } + else + { + INIT_LIST_HEAD(&pool->share_list); + pool->shared_count = 0; + global_shared_pools[first_unused_s_poolid] = pool; + (void)shared_pool_join(pool,client); + } + } + client->pools[d_poolid] = pool; + list_add_tail(&pool->pool_list, &global_pool_list); + pool->pool_id = d_poolid; + pool->persistent = persistent; + pool->uuid[0] = uuid_lo; pool->uuid[1] = uuid_hi; + printk("pool_id=%d\n",d_poolid); + return d_poolid; + +fail: + pool_free(pool); + return -EPERM; +} + +/************ TMEM CONTROL OPERATIONS ************************************/ + +/* freeze/thaw all pools belonging to client cli_id (all domains if -1) */ +static int tmemc_freeze_pools(int cli_id, int arg) +{ + client_t *client; + bool_t freeze = (arg == TMEMC_FREEZE) ? 1 : 0; + bool_t destroy = (arg == TMEMC_DESTROY) ? 1 : 0; + char *s; + + s = destroy ? "destroyed" : ( freeze ? 
"frozen" : "thawed" ); + if ( cli_id == CLI_ID_NULL ) + { + list_for_each_entry(client,&global_client_list,client_list) + { + client->frozen = freeze; + printk("tmem: all pools %s for all %ss\n",s,client_str); + } + } + else + { + if ( (client = tmh_client_from_cli_id(cli_id)) == NULL) + return -1; + client->frozen = freeze; + printk("tmem: all pools %s for %s=%d\n",s,cli_id_str,cli_id); + } + return 0; +} + +static int tmemc_flush_mem(int cli_id, uint32_t kb) +{ + uint32_t npages, flushed_pages, flushed_kb; + + if ( cli_id != CLI_ID_NULL ) + { + printk("tmem: %s-specific flush not supported yet, use --all\n", + client_str); + return -1; + } + /* convert kb to pages, rounding up if necessary */ + npages = (kb + ((1 << (PAGE_SHIFT-10))-1)) >> (PAGE_SHIFT-10); + flushed_pages = tmem_relinquish_npages(npages); + flushed_kb = flushed_pages << (PAGE_SHIFT-10); + return flushed_kb; +} + +/* + * These tmemc_list* routines output lots of stats in a format that is + * intended to be program-parseable, not human-readable. Further, by + * tying each group of stats to a line format indicator (e.g. G= for + * global stats) and each individual stat to a two-letter specifier + * (e.g. Ec:nnnnn in the G= line says there are nnnnn pages in the + * global ephemeral pool), it should allow the stats reported to be + * forward and backwards compatible as tmem evolves. + */ +#define BSIZE 1024 + +static int tmemc_list_client(client_t *c, tmem_cli_va_t buf, int off, + uint32_t len, bool_t use_long) +{ + char info[BSIZE]; + int i, n = 0, sum = 0; + pool_t *p; + bool_t s; + + n = scnprintf(info,BSIZE,"C=CI:%d,ww:%d,ca:%d,co:%d,fr:%d%c", + c->cli_id, c->weight, c->cap, c->compress, + c->frozen, use_long ? 
',' : '\n'); + if (use_long) + n += scnprintf(info+n,BSIZE-n, + "Ec:%ld,Em:%ld,cp:%ld,cb:%lld,cn:%ld,cm:%ld\n", + c->eph_count, c->eph_count_max, + c->compressed_pages, (long long)c->compressed_sum_size, + c->compress_poor, c->compress_nomem); + tmh_copy_to_client_buf_offset(buf,off+sum,info,n+1); + sum += n; + for ( i = 0; i < MAX_POOLS_PER_DOMAIN; i++ ) + { + if ( (p = c->pools[i]) == NULL ) + continue; + s = is_shared(p); + n = scnprintf(info,BSIZE,"P=CI:%d,PI:%d,PT:%c%c,U0:%llx,U1:%llx%c", + c->cli_id, p->pool_id, + is_persistent(p) ? 'P' : 'E', s ? 'S' : 'P', + s ? p->uuid[0] : 0LL, s ? p->uuid[1] : 0LL, + use_long ? ',' : '\n'); + if (use_long) + n += scnprintf(info+n,BSIZE-n, + "Pc:%d,Pm:%d,Oc:%ld,Om:%ld,Nc:%lu,Nm:%lu," + "ps:%lu,pt:%lu,pd:%lu,pr:%lu,px:%lu,gs:%lu,gt:%lu," + "fs:%lu,ft:%lu,os:%lu,ot:%lu\n", + _atomic_read(p->pgp_count), p->pgp_count_max, + p->obj_count, p->obj_count_max, + p->objnode_count, p->objnode_count_max, + p->good_puts, p->puts,p->dup_puts_flushed, p->dup_puts_replaced, + p->no_mem_puts, + p->found_gets, p->gets, + p->flushs_found, p->flushs, p->flush_objs_found, p->flush_objs); + if ( sum + n >= len ) + return sum; + tmh_copy_to_client_buf_offset(buf,off+sum,info,n+1); + sum += n; + } + return sum; +} + +static int tmemc_list_shared(tmem_cli_va_t buf, int off, uint32_t len, + bool_t use_long) +{ + char info[BSIZE]; + int i, n = 0, sum = 0; + pool_t *p; + sharelist_t *sl; + + for ( i = 0; i < MAX_GLOBAL_SHARED_POOLS; i++ ) + { + if ( (p = global_shared_pools[i]) == NULL ) + continue; + n = scnprintf(info+n,BSIZE-n,"S=SI:%d,PT:%c%c,U0:%llx,U1:%llx", + i, is_persistent(p) ? 'P' : 'E', is_shared(p) ? 'S' : 'P', + (unsigned long long)p->uuid[0], (unsigned long long)p->uuid[1]); + list_for_each_entry(sl,&p->share_list, share_list) + n += scnprintf(info+n,BSIZE-n,",SC:%d",sl->client->cli_id); + n += scnprintf(info+n,BSIZE-n,"%c", use_long ? 
',' : '\n'); + if (use_long) + n += scnprintf(info+n,BSIZE-n, + "Pc:%d,Pm:%d,Oc:%ld,Om:%ld,Nc:%lu,Nm:%lu," + "ps:%lu,pt:%lu,pd:%lu,pr:%lu,px:%lu,gs:%lu,gt:%lu," + "fs:%lu,ft:%lu,os:%lu,ot:%lu\n", + _atomic_read(p->pgp_count), p->pgp_count_max, + p->obj_count, p->obj_count_max, + p->objnode_count, p->objnode_count_max, + p->good_puts, p->puts,p->dup_puts_flushed, p->dup_puts_replaced, + p->no_mem_puts, + p->found_gets, p->gets, + p->flushs_found, p->flushs, p->flush_objs_found, p->flush_objs); + if ( sum + n >= len ) + return sum; + tmh_copy_to_client_buf_offset(buf,off+sum,info,n+1); + sum += n; + } + return sum; +} + +#ifdef TMEM_PERF +static int tmemc_list_global_perf(tmem_cli_va_t buf, int off, uint32_t len, + bool_t use_long) +{ + char info[BSIZE]; + int n = 0, sum = 0; + + n = scnprintf(info+n,BSIZE-n,"T="); + n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,succ_get,"G"); + n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,succ_put,"P"); + n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,non_succ_get,"g"); + n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,non_succ_put,"p"); + n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,flush,"F"); + n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,flush_obj,"O"); +#ifdef COMPARE_COPY_PAGE_SSE2 + n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,pg_copy1,"1"); + n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,pg_copy2,"2"); + n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,pg_copy3,"3"); + n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,pg_copy4,"4"); +#else + n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,pg_copy,"C"); +#endif + n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,compress,"c"); + n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,decompress,"d"); + n--; /* overwrite trailing comma */ + n += scnprintf(info+n,BSIZE-n,"\n"); + if ( sum + n >= len ) + return sum; + tmh_copy_to_client_buf_offset(buf,off+sum,info,n+1); + sum += n; + return sum; +} +#else +#define tmemc_list_global_perf(_buf,_off,_len,_use) (0) +#endif + +static int tmemc_list_global(tmem_cli_va_t buf, int off, uint32_t len, + bool_t 
use_long) +{ + char info[BSIZE]; + int n = 0, sum = off; + + n += scnprintf(info,BSIZE,"G=" + "Tt:%lu,Te:%lu,Cf:%lu,Af:%lu,Pf:%lu,Ta:%lu," + "Lm:%lu,Et:%lu,Ea:%lu,Rt:%lu,Ra:%lu,Rx:%lu,Fp:%lu%c", + total_tmem_ops, errored_tmem_ops, failed_copies, + alloc_failed, alloc_page_failed, tmh_avail_pages(), + low_on_memory, evicted_pgs, + evict_attempts, relinq_pgs, relinq_attempts, max_evicts_per_relinq, + total_flush_pool, use_long ? ',' : '\n'); + if (use_long) + n += scnprintf(info+n,BSIZE-n, + "Ec:%ld,Em:%ld,Oc:%d,Om:%d,Nc:%d,Nm:%d,Pc:%d,Pm:%d\n", + global_eph_count, global_eph_count_max, + _atomic_read(global_obj_count), global_obj_count_max, + _atomic_read(global_rtree_node_count), global_rtree_node_count_max, + _atomic_read(global_pgp_count), global_pgp_count_max); + if ( sum + n >= len ) + return sum; + tmh_copy_to_client_buf_offset(buf,off+sum,info,n+1); + sum += n; + return sum; +} + +static int tmemc_list(int cli_id, tmem_cli_va_t buf, uint32_t len, + bool_t use_long) +{ + client_t *client; + int off = 0; + + if ( cli_id == CLI_ID_NULL ) { + off = tmemc_list_global(buf,0,len,use_long); + off += tmemc_list_shared(buf,off,len-off,use_long); + list_for_each_entry(client,&global_client_list,client_list) + off += tmemc_list_client(client, buf, off, len-off, use_long); + off += tmemc_list_global_perf(buf,off,len-off,use_long); + } + else if ( (client = tmh_client_from_cli_id(cli_id)) == NULL) + return -1; + else + off = tmemc_list_client(client, buf, 0, len, use_long); + + + return 0; +} + +static int tmemc_set_var_one(client_t *client, uint32_t subop, uint32_t arg1) +{ + cli_id_t cli_id = client->cli_id; + uint32_t old_weight; + + switch (subop) + { + case TMEMC_SET_WEIGHT: + old_weight = client->weight; + client->weight = arg1; + printk("tmem: weight set to %d for %s=%d\n",arg1,cli_id_str,cli_id); + atomic_sub(old_weight,&client_weight_total); + atomic_add(client->weight,&client_weight_total); + break; + case TMEMC_SET_CAP: + client->cap = arg1; + printk("tmem: cap 
set to %d for %s=%d\n",arg1,cli_id_str,cli_id); + break; + case TMEMC_SET_COMPRESS: + client->compress = arg1 ? 1 : 0; + printk("tmem: compression %s for %s=%d\n", + arg1 ? "enabled" : "disabled",cli_id_str,cli_id); + break; + default: + printk("tmem: unknown subop %d for tmemc_set_var\n",subop); + return -1; + } + return 0; +} + +static int tmemc_set_var(int cli_id, uint32_t subop, uint32_t arg1) +{ + client_t *client; + + if ( cli_id == CLI_ID_NULL ) + list_for_each_entry(client,&global_client_list,client_list) + tmemc_set_var_one(client, subop, arg1); + else if ( (client = tmh_client_from_cli_id(cli_id)) == NULL) + return -1; + else + tmemc_set_var_one(client, subop, arg1); + return 0; +} + +static int do_tmem_control(uint32_t subop, uint32_t cli_id32, + uint32_t arg1, uint32_t arg2, tmem_cli_va_t buf) +{ + int ret; + cli_id_t cli_id = (cli_id_t)cli_id32; + + if (!tmh_current_is_privileged()) + { + /* don't fail... mystery: sometimes dom0 fails here */ + /* return -EPERM; */ + } + switch(subop) + { + case TMEMC_THAW: + case TMEMC_FREEZE: + case TMEMC_DESTROY: + ret = tmemc_freeze_pools(cli_id,subop); + break; + case TMEMC_FLUSH: + ret = tmemc_flush_mem(cli_id,arg1); + break; + case TMEMC_LIST: + ret = tmemc_list(cli_id,buf,arg1,arg2); + break; + case TMEMC_SET_WEIGHT: + case TMEMC_SET_CAP: + case TMEMC_SET_COMPRESS: + ret = tmemc_set_var(cli_id,subop,arg1); + break; + default: + ret = -1; + } + return ret; +} + +/************ EXPORTed FUNCTIONS **************************************/ + +EXPORT long do_tmem_op(tmem_cli_op_t uops) +{ + struct tmem_op op; + client_t *client = tmh_client_from_current(); + pool_t *pool = NULL; + int rc = 0; + bool_t succ_get = 0, succ_put = 0; + bool_t non_succ_get = 0, non_succ_put = 0; + bool_t flush = 0, flush_obj = 0; + bool_t tmem_write_lock_set = 0, tmem_read_lock_set = 0; + static bool_t warned = 0; + DECL_LOCAL_CYC_COUNTER(succ_get); + DECL_LOCAL_CYC_COUNTER(succ_put); + DECL_LOCAL_CYC_COUNTER(non_succ_get); + 
DECL_LOCAL_CYC_COUNTER(non_succ_put); + DECL_LOCAL_CYC_COUNTER(flush); + DECL_LOCAL_CYC_COUNTER(flush_obj); + + if ( !tmem_initialized ) + { + if ( !warned ) + printk("tmem: must specify tmem parameter on xen boot line\n"); + warned = 1; + return -ENODEV; + } + + total_tmem_ops++; + + if ( tmh_lock_all ) + { + if ( tmh_lock_all > 1 ) + spin_lock_irq(&tmem_spinlock); + else + spin_lock(&tmem_spinlock); + } + + START_CYC_COUNTER(succ_get); + DUP_START_CYC_COUNTER(succ_put,succ_get); + DUP_START_CYC_COUNTER(non_succ_get,succ_get); + DUP_START_CYC_COUNTER(non_succ_put,succ_get); + DUP_START_CYC_COUNTER(flush,succ_get); + DUP_START_CYC_COUNTER(flush_obj,succ_get); + + if ( unlikely(tmh_get_tmemop_from_client(&op, uops) != 0) ) + { + printk("tmem: can't get tmem struct from %s\n",client_str); + rc = -EFAULT; + goto out; + } + + if ( op.cmd == TMEM_CONTROL ) + { + tmem_write_lock(&tmem_rwlock); + tmem_write_lock_set = 1; + rc = do_tmem_control(op.subop, op.cli_id, op.arg1, op.arg2, op.buf); + goto out; + } + + /* create per-client tmem structure dynamically on first use by client */ + if ( client == NULL ) + { + tmem_write_lock(&tmem_rwlock); + tmem_write_lock_set = 1; + if ( (client = client_create()) == NULL ) + { + printk("tmem: can't create tmem structure for %s\n",client_str); + rc = -ENOMEM; + goto out; + } + } + + if ( op.cmd == TMEM_NEW_POOL ) + { + if ( !tmem_write_lock_set ) + { + tmem_write_lock(&tmem_rwlock); + tmem_write_lock_set = 1; + } + } + else + { + if ( !tmem_write_lock_set ) + { + tmem_read_lock(&tmem_rwlock); + tmem_read_lock_set = 1; + } + if ( ((uint32_t)op.pool_id >= MAX_POOLS_PER_DOMAIN) || + ((pool = client->pools[op.pool_id]) == NULL) ) + { + rc = -ENODEV; + printk("tmem: operation requested on uncreated pool\n"); + goto out; + } + ASSERT_SENTINEL(pool,POOL); + } + + switch ( op.cmd ) + { + case TMEM_NEW_POOL: + rc = do_tmem_new_pool(op.flags,op.uuid[0],op.uuid[1]); + break; + case TMEM_NEW_PAGE: + rc = do_tmem_put(pool, op.object, op.index, 
op.cmfn, 0, 0, 0); + break; + case TMEM_PUT_PAGE: + rc = do_tmem_put(pool, op.object, op.index, op.cmfn, 0, 0, PAGE_SIZE); + if (rc == 1) succ_put = 1; + else non_succ_put = 1; + break; + case TMEM_GET_PAGE: + rc = do_tmem_get(pool, op.object, op.index, op.cmfn, 0, 0, PAGE_SIZE); + if (rc == 1) succ_get = 1; + else non_succ_get = 1; + break; + case TMEM_FLUSH_PAGE: + flush = 1; + rc = do_tmem_flush_page(pool, op.object, op.index); + break; + case TMEM_FLUSH_OBJECT: + rc = do_tmem_flush_object(pool, op.object); + flush_obj = 1; + break; + case TMEM_DESTROY_POOL: + flush = 1; + rc = do_tmem_destroy_pool(op.pool_id); + break; + case TMEM_READ: + rc = do_tmem_get(pool, op.object, op.index, op.cmfn, + op.tmem_offset, op.pfn_offset, op.len); + break; + case TMEM_WRITE: + rc = do_tmem_put(pool, op.object, op.index, op.cmfn, + op.tmem_offset, op.pfn_offset, op.len); + break; + case TMEM_XCHG: + /* need to hold global lock to ensure xchg is atomic */ + printk("tmem_xchg op not implemented yet\n"); + rc = 0; + break; + default: + printk("tmem: op %d not implemented\n", op.cmd); + rc = 0; + break; + } + +out: + if ( rc < 0 ) + errored_tmem_ops++; + if ( succ_get ) + END_CYC_COUNTER(succ_get); + else if ( succ_put ) + END_CYC_COUNTER(succ_put); + else if ( non_succ_get ) + END_CYC_COUNTER(non_succ_get); + else if ( non_succ_put ) + END_CYC_COUNTER(non_succ_put); + else if ( flush ) + END_CYC_COUNTER(flush); + else + END_CYC_COUNTER(flush_obj); + + if ( tmh_lock_all ) + { + if ( tmh_lock_all > 1 ) + spin_unlock_irq(&tmem_spinlock); + else + spin_unlock(&tmem_spinlock); + } else { + if ( tmem_write_lock_set ) + write_unlock(&tmem_rwlock); + else if ( tmem_read_lock_set ) + read_unlock(&tmem_rwlock); + else + ASSERT(0); + } + + return rc; +} + +/* this should be called when the host is destroying a client */ +EXPORT void tmem_destroy(void *v) +{ + client_t *client = (client_t *)v; + + if ( tmh_lock_all ) + spin_lock(&tmem_spinlock); + else + write_lock(&tmem_rwlock); + + if ( 
client == NULL )
+        printk("tmem: can't destroy tmem pools for %s\n",
+               cli_id_str);
+    else
+    {
+        printk("tmem: flushing tmem pools for %s=%d\n",
+               cli_id_str,client->cli_id);
+        client_flush(client,1);
+    }
+
+    if ( tmh_lock_all )
+        spin_unlock(&tmem_spinlock);
+    else
+        write_unlock(&tmem_rwlock);
+}
+
+/* freezing all pools guarantees that no additional memory will be consumed */
+EXPORT void tmem_freeze_all(unsigned char key)
+{
+    static int freeze = 0;
+
+    if ( tmh_lock_all )
+        spin_lock(&tmem_spinlock);
+    else
+        write_lock(&tmem_rwlock);
+
+    freeze = !freeze;
+    tmemc_freeze_pools(CLI_ID_NULL,freeze);
+
+    if ( tmh_lock_all )
+        spin_unlock(&tmem_spinlock);
+    else
+        write_unlock(&tmem_rwlock);
+}
+
+#define MAX_EVICTS 10  /* should be variable or set via TMEMC_ ?? */
+
+EXPORT void *tmem_relinquish_pages(unsigned int order, unsigned int memflags)
+{
+    pfp_t *pfp;
+    unsigned long evicts_per_relinq = 0;
+    int max_evictions = 10;
+
+    if (!tmh_enabled())
+        return NULL;
+#ifdef __i386__
+    return NULL;
+#endif
+
+    relinq_attempts++;
+    if ( order > 0 )
+    {
+        printk("tmem_relinquish_page: failing order=%d\n", order);
+        return NULL;
+    }
+
+    if ( tmh_called_from_tmem(memflags) )
+    {
+        if ( tmh_lock_all )
+            spin_lock(&tmem_spinlock);
+        else
+            read_lock(&tmem_rwlock);
+    }
+
+    while ( (pfp = tmh_alloc_page(NULL,1)) == NULL )
+    {
+        if ( (max_evictions-- <= 0) || !tmem_evict())
+            break;
+        evicts_per_relinq++;
+    }
+    if ( evicts_per_relinq > max_evicts_per_relinq )
+        max_evicts_per_relinq = evicts_per_relinq;
+    tmh_scrub_page(pfp, memflags);
+    if ( pfp != NULL )
+        relinq_pgs++;
+
+    if ( tmh_called_from_tmem(memflags) )
+    {
+        if ( tmh_lock_all )
+            spin_unlock(&tmem_spinlock);
+        else
+            read_unlock(&tmem_rwlock);
+    }
+
+    return pfp;
+}
+
+/* called at hypervisor startup */
+EXPORT void init_tmem(void)
+{
+    if ( !tmh_enabled() )
+        return;
+
+    radix_tree_init();
+    if ( tmh_init() )
+    {
+        printk("tmem: initialized comp=%d global-lock=%d\n",
+            tmh_compression_enabled(), 
tmh_lock_all); + tmem_initialized = 1; + } + else + printk("tmem: initialization FAILED\n"); +} + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff --git a/xen/common/tmem_xen.c b/xen/common/tmem_xen.c new file mode 100644 index 0000000000..6a0b14f456 --- /dev/null +++ b/xen/common/tmem_xen.c @@ -0,0 +1,334 @@ +/****************************************************************************** + * tmem-xen.c + * + * Xen-specific Transcendent memory + * + * Copyright (c) 2009, Dan Magenheimer, Oracle Corp. + */ + +#include <xen/tmem.h> +#include <xen/tmem_xen.h> +#include <xen/lzo.h> /* compression code */ +#include <xen/paging.h> +#include <xen/domain_page.h> + +#define EXPORT /* indicates code other modules are dependent upon */ + +EXPORT int opt_tmem = 0; +boolean_param("tmem", opt_tmem); + +EXPORT int opt_tmem_compress = 0; +boolean_param("tmem_compress", opt_tmem_compress); + +EXPORT int opt_tmem_lock = 0; +integer_param("tmem_lock", opt_tmem_lock); + +#ifdef COMPARE_COPY_PAGE_SSE2 +DECL_CYC_COUNTER(pg_copy1); +DECL_CYC_COUNTER(pg_copy2); +DECL_CYC_COUNTER(pg_copy3); +DECL_CYC_COUNTER(pg_copy4); +#else +DECL_CYC_COUNTER(pg_copy); +#endif + +/* these are a concurrency bottleneck, could be percpu and dynamically + * allocated iff opt_tmem_compress */ +#define LZO_WORKMEM_BYTES LZO1X_1_MEM_COMPRESS +#define LZO_DSTMEM_PAGES 2 +static DEFINE_PER_CPU(unsigned char *, workmem); +static DEFINE_PER_CPU(unsigned char *, dstmem); + +#ifdef COMPARE_COPY_PAGE_SSE2 +#include <asm/flushtlb.h> /* REMOVE ME AFTER TEST */ +#include <asm/page.h> /* REMOVE ME AFTER TEST */ +#endif +void tmh_copy_page(char *to, char*from) +{ +#ifdef COMPARE_COPY_PAGE_SSE2 + DECL_LOCAL_CYC_COUNTER(pg_copy1); + DECL_LOCAL_CYC_COUNTER(pg_copy2); + DECL_LOCAL_CYC_COUNTER(pg_copy3); + DECL_LOCAL_CYC_COUNTER(pg_copy4); + *to = *from; /* don't measure TLB misses */ + flush_area_local(to,FLUSH_CACHE|FLUSH_ORDER(0)); 
+ flush_area_local(from,FLUSH_CACHE|FLUSH_ORDER(0)); + START_CYC_COUNTER(pg_copy1); + copy_page_sse2(to, from); /* cold cache */ + END_CYC_COUNTER(pg_copy1); + START_CYC_COUNTER(pg_copy2); + copy_page_sse2(to, from); /* hot cache */ + END_CYC_COUNTER(pg_copy2); + flush_area_local(to,FLUSH_CACHE|FLUSH_ORDER(0)); + flush_area_local(from,FLUSH_CACHE|FLUSH_ORDER(0)); + START_CYC_COUNTER(pg_copy3); + memcpy(to, from, PAGE_SIZE); /* cold cache */ + END_CYC_COUNTER(pg_copy3); + START_CYC_COUNTER(pg_copy4); + memcpy(to, from, PAGE_SIZE); /* hot cache */ + END_CYC_COUNTER(pg_copy4); +#else + DECL_LOCAL_CYC_COUNTER(pg_copy); + START_CYC_COUNTER(pg_copy); + memcpy(to, from, PAGE_SIZE); + END_CYC_COUNTER(pg_copy); +#endif +} + +#ifdef __ia64__ +static inline void *cli_mfn_to_va(tmem_cli_mfn_t cmfn, unsigned long *pcli_mfn) +{ + ASSERT(0); +} +#define paging_mark_dirty(_x,_y) do {} while(0) +#else +static inline void *cli_mfn_to_va(tmem_cli_mfn_t cmfn, unsigned long *pcli_mfn) +{ + unsigned long cli_mfn; + p2m_type_t t; + + + if (is_pv_32on64_vcpu(current)) + cmfn.p = (void *)((unsigned long)cmfn.p & 0xffffffffUL); + cli_mfn = mfn_x(gfn_to_mfn(current->domain,(unsigned long)cmfn.p,&t)); + if (t != p2m_ram_rw) + return NULL; + if (pcli_mfn != NULL) + *pcli_mfn = cli_mfn; + return map_domain_page(cli_mfn); +} +#endif + +EXPORT int tmh_copy_from_client(pfp_t *pfp, + tmem_cli_mfn_t cmfn, uint32_t tmem_offset, + uint32_t pfn_offset, uint32_t len) +{ + unsigned long tmem_mfn; + void *tmem_va, *cli_va = NULL; + + ASSERT(pfp != NULL); + if ( tmem_offset || pfn_offset || len ) + if ( (cli_va = cli_mfn_to_va(cmfn,NULL)) == NULL) + return -EFAULT; + tmem_mfn = page_to_mfn(pfp); + tmem_va = map_domain_page(tmem_mfn); + mb(); + if (!len && !tmem_offset && !pfn_offset) + memset(tmem_va, 0, PAGE_SIZE); + else if (len == PAGE_SIZE && !tmem_offset && !pfn_offset) + tmh_copy_page(tmem_va, cli_va); + else if ( (tmem_offset+len <= PAGE_SIZE) && + (pfn_offset+len <= PAGE_SIZE) ) + memcpy((char 
*)tmem_va+tmem_offset,(char *)cli_va+pfn_offset,len); + unmap_domain_page(cli_va); + unmap_domain_page(tmem_va); + return 1; +} + +EXPORT int tmh_compress_from_client(tmem_cli_mfn_t cmfn, + void **out_va, size_t *out_len) +{ + void *cli_va; + int ret = 0; + unsigned char *dmem = this_cpu(dstmem); + unsigned char *wmem = this_cpu(workmem); + + if ( (cli_va = cli_mfn_to_va(cmfn,NULL)) == NULL) + return -EFAULT; + if ( dmem == NULL || wmem == NULL ) + return 0; /* no buffer, so can't compress */ + mb(); + ret = lzo1x_1_compress(cli_va, PAGE_SIZE, dmem, out_len, wmem); + ASSERT(ret == LZO_E_OK); + *out_va = dmem; + unmap_domain_page(cli_va); + return 1; +} + +EXPORT int tmh_copy_to_client(tmem_cli_mfn_t cmfn, pfp_t *pfp, + uint32_t tmem_offset, uint32_t pfn_offset, uint32_t len) +{ + unsigned long tmem_mfn, cli_mfn; + void *tmem_va, *cli_va; + + ASSERT(pfp != NULL); + if ( (cli_va = cli_mfn_to_va(cmfn,&cli_mfn)) == NULL) + return -EFAULT; + tmem_mfn = page_to_mfn(pfp); + tmem_va = map_domain_page(tmem_mfn); + if (len == PAGE_SIZE && !tmem_offset && !pfn_offset) + tmh_copy_page(cli_va, tmem_va); + else if ( (tmem_offset+len <= PAGE_SIZE) && (pfn_offset+len <= PAGE_SIZE) ) + memcpy((char *)cli_va+pfn_offset,(char *)tmem_va+tmem_offset,len); + unmap_domain_page(tmem_va); + unmap_domain_page(cli_va); + paging_mark_dirty(current->domain,cli_mfn); + mb(); + return 1; +} + +EXPORT int tmh_decompress_to_client(tmem_cli_mfn_t cmfn, void *tmem_va, size_t size) +{ + unsigned long cli_mfn; + void *cli_va; + size_t out_len = PAGE_SIZE; + int ret; + + if ( (cli_va = cli_mfn_to_va(cmfn,&cli_mfn)) == NULL) + return -EFAULT; + ret = lzo1x_decompress_safe(tmem_va, size, cli_va, &out_len); + ASSERT(ret == LZO_E_OK); + ASSERT(out_len == PAGE_SIZE); + unmap_domain_page(cli_va); + paging_mark_dirty(current->domain,cli_mfn); + mb(); + return 1; +} + +/****************** XEN-SPECIFIC MEMORY ALLOCATION ********************/ + +EXPORT struct xmem_pool *tmh_mempool = 0; +EXPORT unsigned int 
tmh_mempool_maxalloc = 0; + +EXPORT DEFINE_SPINLOCK(tmh_page_list_lock); +EXPORT PAGE_LIST_HEAD(tmh_page_list); +EXPORT unsigned long tmh_page_list_pages = 0; + +/* free anything on tmh_page_list to Xen's scrub list */ +EXPORT void tmh_release_avail_pages_to_host(void) +{ + spin_lock(&tmh_page_list_lock); + if ( !page_list_empty(&tmh_page_list) ) + { + scrub_list_splice(&tmh_page_list); + INIT_PAGE_LIST_HEAD(&tmh_page_list); + } + spin_unlock(&tmh_page_list_lock); +} + +EXPORT void tmh_scrub_page(struct page_info *pi, unsigned int memflags) +{ + if ( pi == NULL ) + return; + if ( !(memflags & MEMF_tmem) ) + scrub_one_page(pi); +} + +#ifndef __i386__ +static noinline void *tmh_mempool_page_get(unsigned long size) +{ + struct page_info *pi; + + ASSERT(size == PAGE_SIZE); + if ( (pi = tmh_alloc_page(NULL,0)) == NULL ) + return NULL; + ASSERT(IS_VALID_PAGE(pi)); + return page_to_virt(pi); +} + +static void tmh_mempool_page_put(void *page_va) +{ + ASSERT(IS_PAGE_ALIGNED(page_va)); + tmh_free_page(virt_to_page(page_va)); +} + +static int tmh_mempool_init(void) +{ + tmh_mempool = xmem_pool_create("tmem", tmh_mempool_page_get, + tmh_mempool_page_put, PAGE_SIZE, 0, PAGE_SIZE); + if ( tmh_mempool ) + tmh_mempool_maxalloc = xmem_pool_maxalloc(tmh_mempool); + return tmh_mempool != NULL; +} + +/* persistent pools are per-domain */ + +static void *tmh_persistent_pool_page_get(unsigned long size) +{ + struct page_info *pi; + struct domain *d = current->domain; + + ASSERT(size == PAGE_SIZE); + if ( (pi = _tmh_alloc_page_thispool(d)) == NULL ) + return NULL; + ASSERT(IS_VALID_PAGE(pi)); + return map_domain_page(page_to_mfn(pi)); +} + +static void tmh_persistent_pool_page_put(void *page_va) +{ + struct page_info *pi; + + ASSERT(IS_PAGE_ALIGNED(page_va)); + pi = virt_to_page(page_va); + ASSERT(IS_VALID_PAGE(pi)); + _tmh_free_page_thispool(pi); +} +#endif + +/****************** XEN-SPECIFIC CLIENT HANDLING ********************/ + +EXPORT tmh_client_t *tmh_client_init(void) +{ + 
tmh_client_t *tmh; + char name[5]; + domid_t domid = current->domain->domain_id; + int i, shift; + + if ( (tmh = xmalloc(tmh_client_t)) == NULL ) + return NULL; + for (i = 0, shift = 12; i < 4; shift -=4, i++) + name[i] = ((unsigned short)domid >> shift) & 0xf; + name[4] = '\0'; +#ifndef __i386__ + tmh->persistent_pool = xmem_pool_create(name, tmh_persistent_pool_page_get, + tmh_persistent_pool_page_put, PAGE_SIZE, 0, PAGE_SIZE); + if ( tmh->persistent_pool == NULL ) + { + xfree(tmh); + return NULL; + } +#endif + tmh->domain = current->domain; + return tmh; +} + +EXPORT void tmh_client_destroy(tmh_client_t *tmh) +{ +#ifndef __i386__ + xmem_pool_destroy(tmh->persistent_pool); +#endif + xfree(tmh); +} + +/****************** XEN-SPECIFIC HOST INITIALIZATION ********************/ + +EXPORT int tmh_init(void) +{ +#ifndef __i386__ + int dstmem_order, workmem_order; + bool_t bad_alloc = 0; + struct page_info *pi; + unsigned char *p1, *p2; + int cpu; + + if ( !tmh_mempool_init() ) + return 0; + + dstmem_order = get_order_from_pages(LZO_DSTMEM_PAGES); + workmem_order = get_order_from_bytes(LZO1X_1_MEM_COMPRESS); + for_each_cpu ( cpu ) + { + pi = alloc_domheap_pages(0,dstmem_order,0); + per_cpu(dstmem, cpu) = p1 = ((pi == NULL) ? NULL : page_to_virt(pi)); + pi = alloc_domheap_pages(0,workmem_order,0); + per_cpu(workmem, cpu) = p2 = ((pi == NULL) ? 
NULL : page_to_virt(pi)); + if ( (p1 == NULL) || (p2 == NULL) ) + bad_alloc++; + } + if ( bad_alloc ) + printk("tmem: can't allocate compression buffers for %d cpus\n", + bad_alloc); +#endif + return 1; +} diff --git a/xen/common/xmalloc_tlsf.c b/xen/common/xmalloc_tlsf.c index 7a476e8fb7..3f85389e23 100644 --- a/xen/common/xmalloc_tlsf.c +++ b/xen/common/xmalloc_tlsf.c @@ -292,7 +292,6 @@ struct xmem_pool *xmem_pool_create( unsigned long grow_size) { struct xmem_pool *pool; - void *region; int pool_bytes, pool_order; BUG_ON(max_size && (max_size < init_size)); @@ -319,11 +318,9 @@ struct xmem_pool *xmem_pool_create( pool->get_mem = get_mem; pool->put_mem = put_mem; strlcpy(pool->name, name, sizeof(pool->name)); - region = get_mem(init_size); - if ( region == NULL ) - goto out_region; - ADD_REGION(region, init_size, pool); - pool->init_region = region; + + /* always obtain init_region lazily now to ensure it is get_mem'd + * in the same "context" as all other regions */ spin_lock_init(&pool->lock); @@ -332,10 +329,6 @@ struct xmem_pool *xmem_pool_create( spin_unlock(&pool_list_lock); return pool; - - out_region: - free_xenheap_pages(pool, pool_order); - return NULL; } unsigned long xmem_pool_get_used_size(struct xmem_pool *pool) @@ -354,13 +347,15 @@ unsigned long xmem_pool_get_total_size(struct xmem_pool *pool) void xmem_pool_destroy(struct xmem_pool *pool) { + int pool_bytes, pool_order; + if ( pool == NULL ) return; /* User is destroying without ever allocating from this pool */ if ( xmem_pool_get_used_size(pool) == BHDR_OVERHEAD ) { - pool->put_mem(pool->init_region); + ASSERT(!pool->init_region); pool->used_size -= BHDR_OVERHEAD; } @@ -373,7 +368,10 @@ void xmem_pool_destroy(struct xmem_pool *pool) spin_lock(&pool_list_lock); list_del_init(&pool->list); spin_unlock(&pool_list_lock); - pool->put_mem(pool); + + pool_bytes = ROUNDUP_SIZE(sizeof(*pool)); + pool_order = get_order_from_bytes(pool_bytes); + free_xenheap_pages(pool,pool_order); } void 
*xmem_pool_alloc(unsigned long size, struct xmem_pool *pool) @@ -382,6 +380,14 @@ void *xmem_pool_alloc(unsigned long size, struct xmem_pool *pool) int fl, sl; unsigned long tmp_size; + if ( pool->init_region == NULL ) + { + if ( (region = pool->get_mem(pool->init_size)) == NULL ) + goto out; + ADD_REGION(region, pool->init_size, pool); + pool->init_region = region; + } + size = (size < MIN_BLOCK_SIZE) ? MIN_BLOCK_SIZE : ROUNDUP_SIZE(size); /* Rounding up the requested size and calculating fl and sl */ @@ -496,6 +502,11 @@ void xmem_pool_free(void *ptr, struct xmem_pool *pool) spin_unlock(&pool->lock); } +int xmem_pool_maxalloc(struct xmem_pool *pool) +{ + return pool->grow_size - (2 * BHDR_OVERHEAD); +} + /* * Glue for xmalloc(). */ diff --git a/xen/include/Makefile b/xen/include/Makefile index 8427371596..15acff963b 100644 --- a/xen/include/Makefile +++ b/xen/include/Makefile @@ -14,6 +14,7 @@ headers-y := \ compat/physdev.h \ compat/platform.h \ compat/sched.h \ + compat/tmem.h \ compat/trace.h \ compat/vcpu.h \ compat/version.h \ diff --git a/xen/include/asm-ia64/mm.h b/xen/include/asm-ia64/mm.h index bb3dc8ae5b..e8803ba9b1 100644 --- a/xen/include/asm-ia64/mm.h +++ b/xen/include/asm-ia64/mm.h @@ -590,6 +590,8 @@ long arch_memory_op(int op, XEN_GUEST_HANDLE(void) arg); int steal_page( struct domain *d, struct page_info *page, unsigned int memflags); +int donate_page( + struct domain *d, struct page_info *page, unsigned int memflags); #define domain_clamp_alloc_bitsize(d, b) (b) diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h index 6772b40505..4900aa9b13 100644 --- a/xen/include/asm-x86/mm.h +++ b/xen/include/asm-x86/mm.h @@ -487,6 +487,8 @@ int compat_subarch_memory_op(int op, XEN_GUEST_HANDLE(void)); int steal_page( struct domain *d, struct page_info *page, unsigned int memflags); +int donate_page( + struct domain *d, struct page_info *page, unsigned int memflags); int map_ldt_shadow_page(unsigned int); diff --git 
a/xen/include/asm-x86/spinlock.h b/xen/include/asm-x86/spinlock.h index f1a5feb03c..35ed9759e7 100644 --- a/xen/include/asm-x86/spinlock.h +++ b/xen/include/asm-x86/spinlock.h @@ -32,10 +32,10 @@ static always_inline int _raw_spin_trylock(raw_spinlock_t *lock) } typedef struct { - volatile unsigned int lock; + volatile int lock; } raw_rwlock_t; -#define RW_LOCK_BIAS 0x01000000 +#define RW_LOCK_BIAS 0x01000000 #define _RAW_RW_LOCK_UNLOCKED /*(raw_rwlock_t)*/ { RW_LOCK_BIAS } static always_inline void _raw_read_lock(raw_rwlock_t *rw) @@ -66,6 +66,22 @@ static always_inline void _raw_write_lock(raw_rwlock_t *rw) : "=m" (rw->lock) : "i" (RW_LOCK_BIAS) : "memory" ); } +static always_inline int _raw_write_trylock(raw_rwlock_t *rw) +{ + int rc; + + asm volatile ( + " lock; subl %2,%0 \n" + " jz 1f \n" + " lock; addl %2,%0 \n" + " dec %1 \n" + "1:" + : "=m" (rw->lock), "=r" (rc) : "i" (RW_LOCK_BIAS), "1" (1) + : "memory" ); + + return rc; +} + static always_inline void _raw_read_unlock(raw_rwlock_t *rw) { asm volatile ( @@ -81,5 +97,6 @@ static always_inline void _raw_write_unlock(raw_rwlock_t *rw) } #define _raw_rw_is_locked(x) ((x)->lock < RW_LOCK_BIAS) +#define _raw_rw_is_write_locked(x) ((x)->lock <= 0) #endif /* __ASM_SPINLOCK_H */ diff --git a/xen/include/public/tmem.h b/xen/include/public/tmem.h new file mode 100644 index 0000000000..b8d608f591 --- /dev/null +++ b/xen/include/public/tmem.h @@ -0,0 +1,112 @@ +/****************************************************************************** + * tmem.h + * + * Guest OS interface to Xen Transcendent Memory. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ * + * Copyright (c) 2004, K A Fraser + */ + +#ifndef __XEN_PUBLIC_TMEM_H__ +#define __XEN_PUBLIC_TMEM_H__ + +#include "xen.h" + +/* Commands to HYPERVISOR_tmem_op() */ +#define TMEM_CONTROL 0 +#define TMEM_NEW_POOL 1 +#define TMEM_DESTROY_POOL 2 +#define TMEM_NEW_PAGE 3 +#define TMEM_PUT_PAGE 4 +#define TMEM_GET_PAGE 5 +#define TMEM_FLUSH_PAGE 6 +#define TMEM_FLUSH_OBJECT 7 +#define TMEM_READ 8 +#define TMEM_WRITE 9 +#define TMEM_XCHG 10 + +/* Subops for HYPERVISOR_tmem_op(TMEM_CONTROL) */ +#define TMEMC_THAW 0 +#define TMEMC_FREEZE 1 +#define TMEMC_FLUSH 2 +#define TMEMC_DESTROY 3 +#define TMEMC_LIST 4 +#define TMEMC_SET_WEIGHT 5 +#define TMEMC_SET_CAP 6 +#define TMEMC_SET_COMPRESS 7 + +/* Bits for HYPERVISOR_tmem_op(TMEM_NEW_POOL) */ +#define TMEM_POOL_PERSIST 1 +#define TMEM_POOL_SHARED 2 +#define TMEM_POOL_PAGESIZE_SHIFT 4 +#define TMEM_POOL_PAGESIZE_MASK 0xf +#define TMEM_POOL_VERSION_SHIFT 24 +#define TMEM_POOL_VERSION_MASK 0xff + +/* Special errno values */ +#define EFROZEN 1000 +#define EEMPTY 1001 + + +#ifndef __ASSEMBLY__ +typedef XEN_GUEST_HANDLE(void) tmem_cli_mfn_t; +typedef XEN_GUEST_HANDLE(char) tmem_cli_va_t; +struct tmem_op { + uint32_t cmd; + int32_t pool_id; /* private > 0; shared < 0; 0 is invalid */ + union { + struct { /* for cmd == TMEM_NEW_POOL */ + uint64_t uuid[2]; + uint32_t flags; + }; + struct { /* for cmd == TMEM_CONTROL */ + uint32_t subop; + uint32_t cli_id; + uint32_t arg1; + uint32_t arg2; + tmem_cli_va_t buf; + }; + struct { + uint64_t object; + uint32_t index; + uint32_t tmem_offset; + uint32_t pfn_offset; + uint32_t len; + tmem_cli_mfn_t cmfn; /* client machine page frame */ + }; + }; +}; +typedef struct tmem_op tmem_op_t; +DEFINE_XEN_GUEST_HANDLE(tmem_op_t); +typedef XEN_GUEST_HANDLE_64(tmem_op_t) tmem_cli_op_t; + +#endif + +#endif /* __XEN_PUBLIC_TMEM_H__ */ + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff --git 
a/xen/include/public/xen.h b/xen/include/public/xen.h index 524118b6d7..72aa667430 100644 --- a/xen/include/public/xen.h +++ b/xen/include/public/xen.h @@ -91,6 +91,7 @@ DEFINE_XEN_GUEST_HANDLE(xen_pfn_t); #define __HYPERVISOR_sysctl 35 #define __HYPERVISOR_domctl 36 #define __HYPERVISOR_kexec_op 37 +#define __HYPERVISOR_tmem_op 38 /* Architecture-specific hypercall definitions. */ #define __HYPERVISOR_arch_0 48 diff --git a/xen/include/xen/config.h b/xen/include/xen/config.h index 9b26d882bb..7872f13e8d 100644 --- a/xen/include/xen/config.h +++ b/xen/include/xen/config.h @@ -11,7 +11,6 @@ #define EXPORT_SYMBOL(var) #define EXPORT_SYMBOL_GPL(var) -#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]) + __must_be_array(x)) /* * The following log levels are as follows: diff --git a/xen/include/xen/hash.h b/xen/include/xen/hash.h new file mode 100644 index 0000000000..0658c8b619 --- /dev/null +++ b/xen/include/xen/hash.h @@ -0,0 +1,58 @@ +#ifndef _XEN_HASH_H +#define _XEN_HASH_H +/* Fast hashing routine for a long. + (C) 2002 William Lee Irwin III, IBM */ + +/* + * Knuth recommends primes in approximately golden ratio to the maximum + * integer representable by a machine word for multiplicative hashing. + * Chuck Lever verified the effectiveness of this technique: + * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf + * + * These primes are chosen to be bit-sparse, that is operations on + * them can use shifts and additions instead of multiplications for + * machines where multiplications are slow. + */ +#if BITS_PER_LONG == 32 +/* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */ +#define GOLDEN_RATIO_PRIME 0x9e370001UL +#elif BITS_PER_LONG == 64 +/* 2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */ +#define GOLDEN_RATIO_PRIME 0x9e37fffffffc0001UL +#else +#error Define GOLDEN_RATIO_PRIME for your wordsize. 
+#endif + +static inline unsigned long hash_long(unsigned long val, unsigned int bits) +{ + unsigned long hash = val; + +#if BITS_PER_LONG == 64 + /* Sigh, gcc can't optimise this alone like it does for 32 bits. */ + unsigned long n = hash; + n <<= 18; + hash -= n; + n <<= 33; + hash -= n; + n <<= 3; + hash += n; + n <<= 3; + hash -= n; + n <<= 4; + hash += n; + n <<= 2; + hash += n; +#else + /* On some cpus multiply is faster, on others gcc will do shifts */ + hash *= GOLDEN_RATIO_PRIME; +#endif + + /* High bits are more random, so use them. */ + return hash >> (BITS_PER_LONG - bits); +} + +static inline unsigned long hash_ptr(void *ptr, unsigned int bits) +{ + return hash_long((unsigned long)ptr, bits); +} +#endif /* _XEN_HASH_H */ diff --git a/xen/include/xen/hypercall.h b/xen/include/xen/hypercall.h index 43758b9de8..f2e0150fa1 100644 --- a/xen/include/xen/hypercall.h +++ b/xen/include/xen/hypercall.h @@ -13,6 +13,7 @@ #include <public/sysctl.h> #include <public/platform.h> #include <public/event_channel.h> +#include <public/tmem.h> #include <asm/hypercall.h> #include <xsm/xsm.h> @@ -116,6 +117,10 @@ extern long do_xsm_op( XEN_GUEST_HANDLE(xsm_op_t) u_xsm_op); +extern long +do_tmem_op( + XEN_GUEST_HANDLE(tmem_op_t) uops); + #ifdef CONFIG_COMPAT extern int diff --git a/xen/include/xen/lib.h b/xen/include/xen/lib.h index ab76120da6..6b5033a36e 100644 --- a/xen/include/xen/lib.h +++ b/xen/include/xen/lib.h @@ -45,6 +45,8 @@ do { \ #define DIV_ROUND(x, y) (((x) + (y) / 2) / (y)) +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]) + __must_be_array(x)) + #define reserve_bootmem(_p,_l) ((void)0) struct domain; diff --git a/xen/include/xen/lzo.h b/xen/include/xen/lzo.h new file mode 100644 index 0000000000..cbf135f984 --- /dev/null +++ b/xen/include/xen/lzo.h @@ -0,0 +1,44 @@ +#ifndef __LZO_H__ +#define __LZO_H__ +/* + * LZO Public Kernel Interface + * A mini subset of the LZO real-time data compression library + * + * Copyright (C) 1996-2005 Markus F.X.J. 
Oberhumer <markus@oberhumer.com> + * + * The full LZO package can be found at: + * http://www.oberhumer.com/opensource/lzo/ + * + * Changed for kernel use by: + * Nitin Gupta <nitingupta910@gmail.com> + * Richard Purdie <rpurdie@openedhand.com> + */ + +#define LZO1X_MEM_COMPRESS (16384 * sizeof(unsigned char *)) +#define LZO1X_1_MEM_COMPRESS LZO1X_MEM_COMPRESS + +#define lzo1x_worst_compress(x) ((x) + ((x) / 16) + 64 + 3) + +/* This requires 'workmem' of size LZO1X_1_MEM_COMPRESS */ +int lzo1x_1_compress(const unsigned char *src, size_t src_len, + unsigned char *dst, size_t *dst_len, void *wrkmem); + +/* safe decompression with overrun testing */ +int lzo1x_decompress_safe(const unsigned char *src, size_t src_len, + unsigned char *dst, size_t *dst_len); + +/* + * Return values (< 0 = Error) + */ +#define LZO_E_OK 0 +#define LZO_E_ERROR (-1) +#define LZO_E_OUT_OF_MEMORY (-2) +#define LZO_E_NOT_COMPRESSIBLE (-3) +#define LZO_E_INPUT_OVERRUN (-4) +#define LZO_E_OUTPUT_OVERRUN (-5) +#define LZO_E_LOOKBEHIND_OVERRUN (-6) +#define LZO_E_EOF_NOT_FOUND (-7) +#define LZO_E_INPUT_NOT_CONSUMED (-8) +#define LZO_E_NOT_YET_IMPLEMENTED (-9) + +#endif diff --git a/xen/include/xen/mm.h b/xen/include/xen/mm.h index 82340f3ae4..50c47b00e2 100644 --- a/xen/include/xen/mm.h +++ b/xen/include/xen/mm.h @@ -77,6 +77,8 @@ int assign_pages( #define MEMF_no_refcount (1U<<_MEMF_no_refcount) #define _MEMF_populate_on_demand 1 #define MEMF_populate_on_demand (1U<<_MEMF_populate_on_demand) +#define _MEMF_tmem 2 +#define MEMF_tmem (1U<<_MEMF_tmem) #define _MEMF_node 8 #define MEMF_node(n) ((((n)+1)&0xff)<<_MEMF_node) #define _MEMF_bits 24 @@ -222,6 +224,32 @@ page_list_remove_head(struct page_list_head *head) return page; } +static inline void +page_list_splice(struct page_list_head *list, struct page_list_head *head) +{ + struct page_info *first, *last, *at; + + if ( page_list_empty(list) ) + return; + + if ( page_list_empty(head) ) + { + head->next = list->next; + head->tail = list->tail; + 
return; + } + + first = list->next; + last = list->tail; + at = head->next; + + first->list.prev = page_to_mfn(head->next); + head->next = first; + + last->list.next = page_to_mfn(at); + at->list.prev = page_to_mfn(last); +} + #define page_list_for_each(pos, head) \ for ( pos = (head)->next; pos; pos = page_list_next(pos, head) ) #define page_list_for_each_safe(pos, tmp, head) \ @@ -258,6 +286,7 @@ page_list_remove_head(struct page_list_head *head) list_for_each_entry_safe(pos, tmp, head, list) # define page_list_for_each_safe_reverse(pos, tmp, head) \ list_for_each_entry_safe_reverse(pos, tmp, head, list) +# define page_list_splice(list, hd) list_splice(list, hd) #endif /* Automatic page scrubbing for dead domains. */ @@ -272,6 +301,9 @@ extern struct page_list_head page_scrub_list; if ( !page_list_empty(&page_scrub_list) ) \ cpumask_raise_softirq(cpu_online_map, PAGE_SCRUB_SOFTIRQ); \ } while ( 0 ) +void scrub_list_splice(struct page_list_head *); +void scrub_list_add(struct page_info *); +void scrub_one_page(struct page_info *); unsigned long avail_scrub_pages(void); int guest_remove_page(struct domain *d, unsigned long gmfn); diff --git a/xen/include/xen/radix-tree.h b/xen/include/xen/radix-tree.h new file mode 100644 index 0000000000..d4bb4e8992 --- /dev/null +++ b/xen/include/xen/radix-tree.h @@ -0,0 +1,78 @@ +/* + * Copyright (C) 2001 Momchil Velikov + * Portions Copyright (C) 2001 Christoph Hellwig + * Adapted for Xen by Dan Magenheimer, Oracle Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2, or (at + * your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ +#ifndef _XEN_RADIX_TREE_H +#define _XEN_RADIX_TREE_H + +/* root tags are stored in gfp_mask, shifted by __GFP_BITS_SHIFT */ +struct radix_tree_root { + unsigned int height; + struct radix_tree_node *rnode; +}; + +#define RADIX_TREE_MAP_SHIFT 6 + +#define RADIX_TREE_MAP_SIZE (1UL << RADIX_TREE_MAP_SHIFT) +#define RADIX_TREE_MAP_MASK (RADIX_TREE_MAP_SIZE-1) + +#define RADIX_TREE_TAG_LONGS \ + ((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG) + +struct radix_tree_node { + unsigned int count; + void *slots[RADIX_TREE_MAP_SIZE]; +}; + +struct radix_tree_path { + struct radix_tree_node *node; + int offset; +}; + +#define RADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long)) +#define RADIX_TREE_MAX_PATH (RADIX_TREE_INDEX_BITS/RADIX_TREE_MAP_SHIFT + 2) + + +#define RADIX_TREE_INIT(mask) { \ + .height = 0, \ + .rnode = NULL, \ +} + +#define RADIX_TREE(name, mask) \ + struct radix_tree_root name = RADIX_TREE_INIT(mask) + +#define INIT_RADIX_TREE(root, mask) \ +do { \ + (root)->height = 0; \ + (root)->rnode = NULL; \ +} while (0) + +int radix_tree_insert(struct radix_tree_root *root, unsigned long index, + void *item, struct radix_tree_node *(*node_alloc)(void *), void *arg); +void *radix_tree_lookup(struct radix_tree_root *, unsigned long); +void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long); +void radix_tree_destroy(struct radix_tree_root *root, + void (*slot_free)(void *), void (*node_free)(struct radix_tree_node *)); +void *radix_tree_delete(struct radix_tree_root *root, unsigned long index, + void(*node_free)(struct radix_tree_node *)); +unsigned int +radix_tree_gang_lookup(struct radix_tree_root *root, void **results, + unsigned long first_index, unsigned int max_items); +void radix_tree_init(void); + +#endif /* 
_XEN_RADIX_TREE_H */ diff --git a/xen/include/xen/rbtree.h b/xen/include/xen/rbtree.h new file mode 100644 index 0000000000..b16dc5036f --- /dev/null +++ b/xen/include/xen/rbtree.h @@ -0,0 +1,82 @@ +/* + Red Black Trees + (C) 1999 Andrea Arcangeli <andrea@suse.de> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +#ifndef __RBTREE_H__ +#define __RBTREE_H__ + +struct rb_node +{ + unsigned long rb_parent_color; +#define RB_RED 0 +#define RB_BLACK 1 + struct rb_node *rb_right; + struct rb_node *rb_left; +}; + +struct rb_root +{ + struct rb_node *rb_node; +}; + +#define rb_parent(r) ((struct rb_node *)((r)->rb_parent_color & ~3)) +#define rb_color(r) ((r)->rb_parent_color & 1) +#define rb_is_red(r) (!rb_color(r)) +#define rb_is_black(r) rb_color(r) +#define rb_set_red(r) do { (r)->rb_parent_color &= ~1; } while (0) +#define rb_set_black(r) do { (r)->rb_parent_color |= 1; } while (0) + +static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p) +{ + rb->rb_parent_color = (rb->rb_parent_color & 3) | (unsigned long)p; +} +static inline void rb_set_color(struct rb_node *rb, int color) +{ + rb->rb_parent_color = (rb->rb_parent_color & ~1) | color; +} + +#define RB_ROOT (struct rb_root) { NULL, } +#define rb_entry(ptr, type, member) container_of(ptr, type, member) + +#define RB_EMPTY_ROOT(root) ((root)->rb_node == NULL) 
+#define RB_EMPTY_NODE(node) (rb_parent(node) == node) +#define RB_CLEAR_NODE(node) (rb_set_parent(node, node)) + +extern void rb_insert_color(struct rb_node *, struct rb_root *); +extern void rb_erase(struct rb_node *, struct rb_root *); + +/* Find logical next and previous nodes in a tree */ +extern struct rb_node *rb_next(struct rb_node *); +extern struct rb_node *rb_prev(struct rb_node *); +extern struct rb_node *rb_first(struct rb_root *); +extern struct rb_node *rb_last(struct rb_root *); + +/* Fast replacement of a single node without remove/rebalance/add/rebalance */ +extern void rb_replace_node(struct rb_node *victim, struct rb_node *new, + struct rb_root *root); + +static inline void rb_link_node(struct rb_node * node, struct rb_node * parent, + struct rb_node ** rb_link) +{ + node->rb_parent_color = (unsigned long )parent; + node->rb_left = node->rb_right = NULL; + + *rb_link = node; +} + +#endif /* __RBTREE_H__ */ diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h index 46731a5e98..7ab04a3343 100644 --- a/xen/include/xen/sched.h +++ b/xen/include/xen/sched.h @@ -269,6 +269,9 @@ struct domain /* VRAM dirty support. 
*/ struct sh_dirty_vram *dirty_vram; + + /* transcendent memory, auto-allocated on first tmem op by each domain */ + void *tmem; }; struct domain_setup_info diff --git a/xen/include/xen/spinlock.h b/xen/include/xen/spinlock.h index 7a5a5ab5d7..a952f0700d 100644 --- a/xen/include/xen/spinlock.h +++ b/xen/include/xen/spinlock.h @@ -67,12 +67,14 @@ void _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags); void _write_lock(rwlock_t *lock); void _write_lock_irq(rwlock_t *lock); unsigned long _write_lock_irqsave(rwlock_t *lock); +int _write_trylock(rwlock_t *lock); void _write_unlock(rwlock_t *lock); void _write_unlock_irq(rwlock_t *lock); void _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags); int _rw_is_locked(rwlock_t *lock); +int _rw_is_write_locked(rwlock_t *lock); #define spin_lock(l) _spin_lock(l) #define spin_lock_irq(l) _spin_lock_irq(l) @@ -110,11 +112,13 @@ int _rw_is_locked(rwlock_t *lock); #define write_lock(l) _write_lock(l) #define write_lock_irq(l) _write_lock_irq(l) #define write_lock_irqsave(l, f) ((f) = _write_lock_irqsave(l)) +#define write_trylock(l) _write_trylock(l) #define write_unlock(l) _write_unlock(l) #define write_unlock_irq(l) _write_unlock_irq(l) #define write_unlock_irqrestore(l, f) _write_unlock_irqrestore(l, f) #define rw_is_locked(l) _rw_is_locked(l) +#define rw_is_write_locked(l) _rw_is_write_locked(l) #endif /* __SPINLOCK_H__ */ diff --git a/xen/include/xen/tmem.h b/xen/include/xen/tmem.h new file mode 100644 index 0000000000..ff009b6cf0 --- /dev/null +++ b/xen/include/xen/tmem.h @@ -0,0 +1,16 @@ +/****************************************************************************** + * tmem.h + * + * Transcendent memory + * + * Copyright (c) 2008, Dan Magenheimer, Oracle Corp. 
+ */ + +#ifndef __XEN_TMEM_H__ +#define __XEN_TMEM_H__ + +extern void init_tmem(void); +extern void tmem_destroy(void *); +extern void *tmem_relinquish_pages(unsigned int, unsigned int); + +#endif /* __XEN_TMEM_H__ */ diff --git a/xen/include/xen/tmem_xen.h b/xen/include/xen/tmem_xen.h new file mode 100644 index 0000000000..8d653c27df --- /dev/null +++ b/xen/include/xen/tmem_xen.h @@ -0,0 +1,356 @@ +/****************************************************************************** + * tmem_xen.h + * + * Xen-specific Transcendent memory + * + * Copyright (c) 2009, Dan Magenheimer, Oracle Corp. + */ + +#ifndef __XEN_TMEM_XEN_H__ +#define __XEN_TMEM_XEN_H__ + +#include <xen/config.h> +#include <xen/mm.h> /* heap alloc/free */ +#include <xen/xmalloc.h> /* xmalloc/xfree */ +#include <xen/sched.h> /* struct domain */ +#include <xen/guest_access.h> /* copy_from_guest */ +#include <xen/hash.h> /* hash_long */ +#include <public/tmem.h> + +struct tmem_host_dependent_client { + struct domain *domain; + struct xmem_pool *persistent_pool; +}; +typedef struct tmem_host_dependent_client tmh_client_t; + +#define IS_PAGE_ALIGNED(addr) \ + ((void *)((((unsigned long)addr + (PAGE_SIZE - 1)) & PAGE_MASK)) == addr) +#define IS_VALID_PAGE(_pi) ( mfn_valid(page_to_mfn(_pi)) ) + +extern struct xmem_pool *tmh_mempool; +extern unsigned int tmh_mempool_maxalloc; +extern struct page_list_head tmh_page_list; +extern spinlock_t tmh_page_list_lock; +extern unsigned long tmh_page_list_pages; + +extern spinlock_t tmem_lock; +extern spinlock_t tmem_spinlock; +extern rwlock_t tmem_rwlock; + +extern void tmh_copy_page(char *to, char*from); +extern int tmh_init(void); +extern tmh_client_t *tmh_client_init(void); +extern void tmh_client_destroy(tmh_client_t *); +#define tmh_hash hash_long + +extern void tmh_release_avail_pages_to_host(void); +extern void tmh_scrub_page(struct page_info *pi, unsigned int memflags); + +extern int opt_tmem_compress; +static inline int tmh_compression_enabled(void) +{ + 
return opt_tmem_compress; +} + +extern int opt_tmem; +static inline int tmh_enabled(void) +{ + return opt_tmem; +} + +extern int opt_tmem_lock; + +extern int opt_tmem_flush_dups; + +/* + * Memory free page list management + */ + +static inline struct page_info *tmh_page_list_get(void) +{ + struct page_info *pi; + + spin_lock(&tmh_page_list_lock); + if ( (pi = page_list_remove_head(&tmh_page_list)) != NULL ) + tmh_page_list_pages--; + spin_unlock(&tmh_page_list_lock); + ASSERT((pi == NULL) || IS_VALID_PAGE(pi)); + return pi; +} + +static inline void tmh_page_list_put(struct page_info *pi) +{ + ASSERT(IS_VALID_PAGE(pi)); + spin_lock(&tmh_page_list_lock); + page_list_add(pi, &tmh_page_list); + tmh_page_list_pages++; + spin_unlock(&tmh_page_list_lock); +} + +static inline unsigned long tmh_avail_pages(void) +{ + return tmh_page_list_pages; +} + +/* + * Ephemeral memory allocation for persistent data + */ + +static inline bool_t domain_fully_allocated(struct domain *d) +{ + return ( d->tot_pages >= d->max_pages ); +} +#define tmh_client_memory_fully_allocated(_pool) \ + domain_fully_allocated(_pool->client->tmh->domain) + +static inline void *_tmh_alloc_subpage_thispool(struct xmem_pool *cmem_mempool, + size_t size, size_t align) +{ +#if 0 + if ( d->tot_pages >= d->max_pages ) + return NULL; +#endif +#ifdef __i386__ + return _xmalloc(size,align); +#else + ASSERT( size < tmh_mempool_maxalloc ); + if ( cmem_mempool == NULL ) + return NULL; + return xmem_pool_alloc(size, cmem_mempool); +#endif +} +#define tmh_alloc_subpage_thispool(_pool, _s, _a) \ + _tmh_alloc_subpage_thispool(pool->client->tmh->persistent_pool, \ + _s, _a) + +static inline void _tmh_free_subpage_thispool(struct xmem_pool *cmem_mempool, + void *ptr, size_t size) +{ +#ifdef __i386__ + xfree(ptr); +#else + ASSERT( size < tmh_mempool_maxalloc ); + ASSERT( cmem_mempool != NULL ); + xmem_pool_free(ptr,cmem_mempool); +#endif +} +#define tmh_free_subpage_thispool(_pool, _p, _s) \ + 
_tmh_free_subpage_thispool(_pool->client->tmh->persistent_pool, _p, _s) + +static inline struct page_info *_tmh_alloc_page_thispool(struct domain *d) +{ + struct page_info *pi; + + /* note that this tot_pages check is not protected by d->page_alloc_lock, + * so may race and periodically fail in donate_page or alloc_domheap_pages + * That's OK... neither is a problem, though chatty if log_lvl is set */ + if ( d->tot_pages >= d->max_pages ) + return NULL; + + if ( tmh_page_list_pages ) + { + if ( (pi = tmh_page_list_get()) != NULL ) + { + if ( donate_page(d,pi,0) == 0 ) + goto out; + else + tmh_page_list_put(pi); + } + } + + pi = alloc_domheap_pages(d,0,MEMF_tmem); + +out: + ASSERT((pi == NULL) || IS_VALID_PAGE(pi)); + return pi; +} +#define tmh_alloc_page_thispool(_pool) \ + _tmh_alloc_page_thispool(_pool->client->tmh->domain) + +static inline void _tmh_free_page_thispool(struct page_info *pi) +{ + struct domain *d = page_get_owner(pi); + + ASSERT(IS_VALID_PAGE(pi)); + if ( (d == NULL) || steal_page(d,pi,0) == 0 ) + tmh_page_list_put(pi); + else + { + scrub_one_page(pi); + ASSERT((pi->count_info & ~(PGC_allocated | 1)) == 0); + free_domheap_pages(pi,0); + } +} +#define tmh_free_page_thispool(_pool,_pg) \ + _tmh_free_page_thispool(_pg) + +/* + * Memory allocation for ephemeral (non-persistent) data + */ + +static inline void *tmh_alloc_subpage(void *pool, size_t size, + size_t align) +{ +#ifdef __i386__ + ASSERT( size < PAGE_SIZE ); + return _xmalloc(size, align); +#else + ASSERT( size < tmh_mempool_maxalloc ); + ASSERT( tmh_mempool != NULL ); + return xmem_pool_alloc(size, tmh_mempool); +#endif +} + +static inline void tmh_free_subpage(void *ptr, size_t size) +{ +#ifdef __i386__ + ASSERT( size < PAGE_SIZE ); + xfree(ptr); +#else + ASSERT( size < tmh_mempool_maxalloc ); + xmem_pool_free(ptr,tmh_mempool); +#endif +} + +static inline struct page_info *tmh_alloc_page(void *pool, int no_heap) +{ + struct page_info *pi = tmh_page_list_get(); + + if ( pi == NULL && 
!no_heap ) + pi = alloc_domheap_pages(0,0,MEMF_tmem); + ASSERT((pi == NULL) || IS_VALID_PAGE(pi)); + return pi; +} + +static inline void tmh_free_page(struct page_info *pi) +{ + ASSERT(IS_VALID_PAGE(pi)); + tmh_page_list_put(pi); +} + +static inline unsigned int tmem_subpage_maxsize(void) +{ + return tmh_mempool_maxalloc; +} + +#define tmh_lock_all opt_tmem_lock +#define tmh_flush_dups opt_tmem_flush_dups +#define tmh_called_from_tmem(_memflags) (_memflags & MEMF_tmem) + +/* "Client" (==domain) abstraction */ + +struct client; +typedef domid_t cli_id_t; +typedef struct domain tmh_cli_ptr_t; +typedef struct page_info pfp_t; + +/* this appears to be unreliable when a domain is being shut down */ +static inline struct client *tmh_client_from_cli_id(cli_id_t cli_id) +{ + struct domain *d = get_domain_by_id(cli_id); + if (d == NULL) + return NULL; + return (struct client *)(d->tmem); +} + +static inline struct client *tmh_client_from_current(void) +{ + return (struct client *)(current->domain->tmem); +} + +static inline cli_id_t tmh_get_cli_id_from_current(void) +{ + return current->domain->domain_id; +} + +static inline tmh_cli_ptr_t *tmh_get_cli_ptr_from_current(void) +{ + return current->domain; +} + +static inline void tmh_set_current_client(struct client *client) +{ + current->domain->tmem = client; +} + +static inline bool_t tmh_current_is_privileged(void) +{ + return IS_PRIV(current->domain); +} + +/* these typedefs are in the public/tmem.h interface +typedef XEN_GUEST_HANDLE(void) cli_mfn_t; +typedef XEN_GUEST_HANDLE(char) cli_va_t; +typedef XEN_GUEST_HANDLE(tmem_op_t) cli_tmemop_t; +*/ + +static inline int tmh_get_tmemop_from_client(tmem_op_t *op, tmem_cli_op_t uops) +{ + return __copy_from_guest(op, uops, 1); +} + +static inline void tmh_copy_to_client_buf_offset(tmem_cli_va_t clibuf, int off, + char *tmembuf, int len) +{ + copy_to_guest_offset(clibuf,off,tmembuf,len); +} + +#define TMH_CLI_ID_NULL ((cli_id_t)((domid_t)-1L)) + +#define tmh_cli_id_str "domid" 
+#define tmh_client_str "domain" + +extern int tmh_decompress_to_client(tmem_cli_mfn_t,void*,size_t); + +extern int tmh_compress_from_client(tmem_cli_mfn_t,void**,size_t *); + +extern int tmh_copy_from_client(pfp_t *pfp, + tmem_cli_mfn_t cmfn, uint32_t tmem_offset, + uint32_t pfn_offset, uint32_t len); + +extern int tmh_copy_to_client(tmem_cli_mfn_t cmfn, pfp_t *pfp, + uint32_t tmem_offset, uint32_t pfn_offset, uint32_t len); + + +#define TMEM_PERF +#ifdef TMEM_PERF +#define DECL_CYC_COUNTER(x) \ + uint64_t x##_sum_cycles = 0, x##_count = 0; \ + uint32_t x##_min_cycles = 0x7fffffff, x##_max_cycles = 0; +#define EXTERN_CYC_COUNTER(x) \ + extern uint64_t x##_sum_cycles, x##_count; \ + extern uint32_t x##_min_cycles, x##_max_cycles; +#define DECL_LOCAL_CYC_COUNTER(x) \ + int64_t x##_start = 0 +#define START_CYC_COUNTER(x) x##_start = get_cycles() +#define DUP_START_CYC_COUNTER(x,y) x##_start = y##_start +/* following might race, but since its advisory only, don't care */ +#define END_CYC_COUNTER(x) \ + do { \ + x##_start = get_cycles() - x##_start; \ + if (x##_start > 0 && x##_start < 1000000000) { \ + x##_sum_cycles += x##_start; x##_count++; \ + if ((uint32_t)x##_start < x##_min_cycles) x##_min_cycles = x##_start; \ + if ((uint32_t)x##_start > x##_max_cycles) x##_max_cycles = x##_start; \ + } \ + } while (0) +#define RESET_CYC_COUNTER(x) { x##_sum_cycles = 0, x##_count = 0; \ + x##_min_cycles = 0x7fffffff, x##_max_cycles = 0; } +#define SCNPRINTF_CYC_COUNTER(buf,size,x,tag) \ + scnprintf(buf,size, \ + tag"n:%"PRIu64","tag"t:%"PRIu64","tag"x:%"PRId32","tag"m:%"PRId32",", \ + x##_count,x##_sum_cycles,x##_max_cycles,x##_min_cycles) +#else +#define DECL_CYC_COUNTER(x) +#define EXTERN_CYC_COUNTER(x) \ + extern uint64_t x##_sum_cycles, x##_count; \ + extern uint32_t x##_min_cycles, x##_max_cycles; +#define DECL_LOCAL_CYC_COUNTER(x) do { } while (0) +#define START_CYC_COUNTER(x) do { } while (0) +#define DUP_START_CYC_COUNTER(x) do { } while (0) +#define END_CYC_COUNTER(x) 
do { } while (0) +#define SCNPRINTF_CYC_COUNTER(buf,size,x,tag) (0) +#define RESET_CYC_COUNTER(x) do { } while (0) +#endif + +#endif /* __XEN_TMEM_XEN_H__ */ diff --git a/xen/include/xen/xmalloc.h b/xen/include/xen/xmalloc.h index e41cc36e74..a5188e8629 100644 --- a/xen/include/xen/xmalloc.h +++ b/xen/include/xen/xmalloc.h @@ -76,7 +76,13 @@ void xmem_pool_destroy(struct xmem_pool *pool); void *xmem_pool_alloc(unsigned long size, struct xmem_pool *pool); /** - * xmem_pool_free - free memory from given pool + * xmem_pool_maxalloc - xmem_pool_alloc's greater than this size will fail + * @mem_pool: pool + */ +int xmem_pool_maxalloc(struct xmem_pool *pool); + +/** + * xmem_pool_maxsize - * @ptr: address of memory to be freed * @mem_pool: pool to free from */ diff --git a/xen/include/xlat.lst b/xen/include/xlat.lst index f2e4597648..0dfd7c75e1 100644 --- a/xen/include/xlat.lst +++ b/xen/include/xlat.lst @@ -74,3 +74,6 @@ ? processor_px platform.h ! psd_package platform.h ! processor_performance platform.h +# ? tmem_op_t tmem.h +# ? tmem_cli_mfn_t tmem.h +# ? tmem_cli_va_t tmem.h |