path: root/xen/common/tmem_xen.c
author     Keir Fraser <keir.fraser@citrix.com>    2009-05-26 11:05:04 +0100
committer  Keir Fraser <keir.fraser@citrix.com>    2009-05-26 11:05:04 +0100
commit     6009f4ddb2cdb8555d2d5e030d351893e971b995 (patch)
tree       6f146a530b5065a1688aa456280f965e1751f2c8 /xen/common/tmem_xen.c
parent     ff811c2bc429a70798cf65913549c0ddaab70c3d (diff)
Transcendent memory ("tmem") for Xen.
Tmem, when called from a tmem-capable (paravirtualized) guest, makes use of otherwise unutilized ("fallow") memory to create and manage pools of pages that can be accessed from the guest either as "ephemeral" pages or as "persistent" pages. In either case, the pages are not directly addressable by the guest; they can only be copied to and from guest memory via the tmem interface. Ephemeral pages are a nice place for a guest to put recently evicted clean pages that it might need again; these pages can be reclaimed synchronously by Xen for other guests or other uses. Persistent pages are a nice place for a guest to put "swap" pages to avoid sending them to disk. These pages retain data as long as the guest lives, but count against the guest memory allocation.

Tmem pages may optionally be compressed and, in certain cases, can be shared between guests. Tmem also handles concurrency nicely and provides limited QoS settings to combat malicious DoS attempts. Save/restore and live migration support is not yet provided.

Tmem is primarily targeted at an x86 64-bit hypervisor. On a 32-bit x86 hypervisor, it has limited functionality and testing due to limitations of the xen heap. Nearly all of tmem is architecture-independent; three routines remain to be ported to ia64, and then it should work on that architecture too. It is also structured to be portable to non-Xen environments.

Tmem defaults to off (for now) and must be enabled with a "tmem" xen boot option (and does nothing unless a tmem-capable guest is running). The "tmem_compress" boot option enables compression, which takes about 10x more CPU but approximately doubles the number of pages that can be stored. Tmem can be controlled via several "xm" commands, and many interesting tmem statistics can be obtained.

A README and internal specification will follow, but lots of useful prose about tmem, as well as Linux patches, can be found at http://oss.oracle.com/projects/tmem .

Signed-off-by: Dan Magenheimer <dan.magenheimer@oracle.com>
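As a minimal sketch of how the boot options above would be used (the GRUB menu entry and the kernel/initrd paths are hypothetical; only the "tmem" and "tmem_compress" options come from this patch), enabling tmem with compression from a legacy-GRUB entry might look like:

    title Xen with tmem
        kernel /boot/xen.gz tmem tmem_compress
        module /boot/vmlinuz-2.6-xen ro root=/dev/sda1
        module /boot/initrd-2.6-xen.img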
Diffstat (limited to 'xen/common/tmem_xen.c')
-rw-r--r--  xen/common/tmem_xen.c  334
1 file changed, 334 insertions, 0 deletions
diff --git a/xen/common/tmem_xen.c b/xen/common/tmem_xen.c
new file mode 100644
index 0000000000..6a0b14f456
--- /dev/null
+++ b/xen/common/tmem_xen.c
@@ -0,0 +1,334 @@
+/******************************************************************************
+ * tmem_xen.c
+ *
+ * Xen-specific Transcendent memory
+ *
+ * Copyright (c) 2009, Dan Magenheimer, Oracle Corp.
+ */
+
+#include <xen/tmem.h>
+#include <xen/tmem_xen.h>
+#include <xen/lzo.h> /* compression code */
+#include <xen/paging.h>
+#include <xen/domain_page.h>
+
+#define EXPORT /* indicates code other modules are dependent upon */
+
+EXPORT int opt_tmem = 0;
+boolean_param("tmem", opt_tmem);
+
+EXPORT int opt_tmem_compress = 0;
+boolean_param("tmem_compress", opt_tmem_compress);
+
+EXPORT int opt_tmem_lock = 0;
+integer_param("tmem_lock", opt_tmem_lock);
+
+#ifdef COMPARE_COPY_PAGE_SSE2
+DECL_CYC_COUNTER(pg_copy1);
+DECL_CYC_COUNTER(pg_copy2);
+DECL_CYC_COUNTER(pg_copy3);
+DECL_CYC_COUNTER(pg_copy4);
+#else
+DECL_CYC_COUNTER(pg_copy);
+#endif
+
+/* per-cpu compression buffers; always allocated at init, but could be
+ * allocated dynamically iff opt_tmem_compress */
+#define LZO_WORKMEM_BYTES LZO1X_1_MEM_COMPRESS
+#define LZO_DSTMEM_PAGES 2
+static DEFINE_PER_CPU(unsigned char *, workmem);
+static DEFINE_PER_CPU(unsigned char *, dstmem);
+
+#ifdef COMPARE_COPY_PAGE_SSE2
+#include <asm/flushtlb.h> /* REMOVE ME AFTER TEST */
+#include <asm/page.h> /* REMOVE ME AFTER TEST */
+#endif
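+/* Copy one page of data; when COMPARE_COPY_PAGE_SSE2 is defined, also time
+ * copy_page_sse2() against memcpy() on cold and hot caches for comparison. */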
+void tmh_copy_page(char *to, char *from)
+{
+#ifdef COMPARE_COPY_PAGE_SSE2
+ DECL_LOCAL_CYC_COUNTER(pg_copy1);
+ DECL_LOCAL_CYC_COUNTER(pg_copy2);
+ DECL_LOCAL_CYC_COUNTER(pg_copy3);
+ DECL_LOCAL_CYC_COUNTER(pg_copy4);
+ *to = *from; /* don't measure TLB misses */
+ flush_area_local(to,FLUSH_CACHE|FLUSH_ORDER(0));
+ flush_area_local(from,FLUSH_CACHE|FLUSH_ORDER(0));
+ START_CYC_COUNTER(pg_copy1);
+ copy_page_sse2(to, from); /* cold cache */
+ END_CYC_COUNTER(pg_copy1);
+ START_CYC_COUNTER(pg_copy2);
+ copy_page_sse2(to, from); /* hot cache */
+ END_CYC_COUNTER(pg_copy2);
+ flush_area_local(to,FLUSH_CACHE|FLUSH_ORDER(0));
+ flush_area_local(from,FLUSH_CACHE|FLUSH_ORDER(0));
+ START_CYC_COUNTER(pg_copy3);
+ memcpy(to, from, PAGE_SIZE); /* cold cache */
+ END_CYC_COUNTER(pg_copy3);
+ START_CYC_COUNTER(pg_copy4);
+ memcpy(to, from, PAGE_SIZE); /* hot cache */
+ END_CYC_COUNTER(pg_copy4);
+#else
+ DECL_LOCAL_CYC_COUNTER(pg_copy);
+ START_CYC_COUNTER(pg_copy);
+ memcpy(to, from, PAGE_SIZE);
+ END_CYC_COUNTER(pg_copy);
+#endif
+}
+
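+/* Map a client-supplied guest frame into Xen's address space, returning NULL
+ * unless the frame is ordinary read/write RAM; the machine frame number is
+ * optionally returned through pcli_mfn.  (Stubbed out on ia64 for now.) */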
+#ifdef __ia64__
+static inline void *cli_mfn_to_va(tmem_cli_mfn_t cmfn, unsigned long *pcli_mfn)
+{
+ ASSERT(0);
+}
+#define paging_mark_dirty(_x,_y) do {} while(0)
+#else
+static inline void *cli_mfn_to_va(tmem_cli_mfn_t cmfn, unsigned long *pcli_mfn)
+{
+ unsigned long cli_mfn;
+ p2m_type_t t;
+
+ if (is_pv_32on64_vcpu(current))
+ cmfn.p = (void *)((unsigned long)cmfn.p & 0xffffffffUL);
+ cli_mfn = mfn_x(gfn_to_mfn(current->domain,(unsigned long)cmfn.p,&t));
+ if (t != p2m_ram_rw)
+ return NULL;
+ if (pcli_mfn != NULL)
+ *pcli_mfn = cli_mfn;
+ return map_domain_page(cli_mfn);
+}
+#endif
+
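+/* Copy data from a client page into a tmem page: zero the tmem page when no
+ * offsets or length are given, copy the whole page when len == PAGE_SIZE and
+ * both offsets are zero, otherwise copy a sub-page range. */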
+EXPORT int tmh_copy_from_client(pfp_t *pfp,
+ tmem_cli_mfn_t cmfn, uint32_t tmem_offset,
+ uint32_t pfn_offset, uint32_t len)
+{
+ unsigned long tmem_mfn;
+ void *tmem_va, *cli_va = NULL;
+
+ ASSERT(pfp != NULL);
+ if ( tmem_offset || pfn_offset || len )
+ if ( (cli_va = cli_mfn_to_va(cmfn,NULL)) == NULL)
+ return -EFAULT;
+ tmem_mfn = page_to_mfn(pfp);
+ tmem_va = map_domain_page(tmem_mfn);
+ mb();
+ if (!len && !tmem_offset && !pfn_offset)
+ memset(tmem_va, 0, PAGE_SIZE);
+ else if (len == PAGE_SIZE && !tmem_offset && !pfn_offset)
+ tmh_copy_page(tmem_va, cli_va);
+ else if ( (tmem_offset+len <= PAGE_SIZE) &&
+ (pfn_offset+len <= PAGE_SIZE) )
+ memcpy((char *)tmem_va+tmem_offset,(char *)cli_va+pfn_offset,len);
+ unmap_domain_page(cli_va);
+ unmap_domain_page(tmem_va);
+ return 1;
+}
+
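+/* LZO1X-compress a client page into this CPU's dstmem buffer; returns -EFAULT
+ * if the client frame cannot be mapped, or 0 if the per-cpu compression
+ * buffers were never allocated. */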
+EXPORT int tmh_compress_from_client(tmem_cli_mfn_t cmfn,
+ void **out_va, size_t *out_len)
+{
+ void *cli_va;
+ int ret = 0;
+ unsigned char *dmem = this_cpu(dstmem);
+ unsigned char *wmem = this_cpu(workmem);
+
+ if ( (cli_va = cli_mfn_to_va(cmfn,NULL)) == NULL)
+ return -EFAULT;
+ if ( dmem == NULL || wmem == NULL )
+ return 0; /* no buffer, so can't compress */
+ mb();
+ ret = lzo1x_1_compress(cli_va, PAGE_SIZE, dmem, out_len, wmem);
+ ASSERT(ret == LZO_E_OK);
+ *out_va = dmem;
+ unmap_domain_page(cli_va);
+ return 1;
+}
+
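+/* Copy data from a tmem page back into a client page (whole page or a
+ * sub-page range) and mark the client frame dirty. */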
+EXPORT int tmh_copy_to_client(tmem_cli_mfn_t cmfn, pfp_t *pfp,
+ uint32_t tmem_offset, uint32_t pfn_offset, uint32_t len)
+{
+ unsigned long tmem_mfn, cli_mfn;
+ void *tmem_va, *cli_va;
+
+ ASSERT(pfp != NULL);
+ if ( (cli_va = cli_mfn_to_va(cmfn,&cli_mfn)) == NULL)
+ return -EFAULT;
+ tmem_mfn = page_to_mfn(pfp);
+ tmem_va = map_domain_page(tmem_mfn);
+ if (len == PAGE_SIZE && !tmem_offset && !pfn_offset)
+ tmh_copy_page(cli_va, tmem_va);
+ else if ( (tmem_offset+len <= PAGE_SIZE) && (pfn_offset+len <= PAGE_SIZE) )
+ memcpy((char *)cli_va+pfn_offset,(char *)tmem_va+tmem_offset,len);
+ unmap_domain_page(tmem_va);
+ unmap_domain_page(cli_va);
+ paging_mark_dirty(current->domain,cli_mfn);
+ mb();
+ return 1;
+}
+
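+/* Decompress a previously compressed page directly into a client page and
+ * mark the client frame dirty. */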
+EXPORT int tmh_decompress_to_client(tmem_cli_mfn_t cmfn, void *tmem_va, size_t size)
+{
+ unsigned long cli_mfn;
+ void *cli_va;
+ size_t out_len = PAGE_SIZE;
+ int ret;
+
+ if ( (cli_va = cli_mfn_to_va(cmfn,&cli_mfn)) == NULL)
+ return -EFAULT;
+ ret = lzo1x_decompress_safe(tmem_va, size, cli_va, &out_len);
+ ASSERT(ret == LZO_E_OK);
+ ASSERT(out_len == PAGE_SIZE);
+ unmap_domain_page(cli_va);
+ paging_mark_dirty(current->domain,cli_mfn);
+ mb();
+ return 1;
+}
+
+/****************** XEN-SPECIFIC MEMORY ALLOCATION ********************/
+
+EXPORT struct xmem_pool *tmh_mempool = 0;
+EXPORT unsigned int tmh_mempool_maxalloc = 0;
+
+EXPORT DEFINE_SPINLOCK(tmh_page_list_lock);
+EXPORT PAGE_LIST_HEAD(tmh_page_list);
+EXPORT unsigned long tmh_page_list_pages = 0;
+
+/* free anything on tmh_page_list to Xen's scrub list */
+EXPORT void tmh_release_avail_pages_to_host(void)
+{
+ spin_lock(&tmh_page_list_lock);
+ if ( !page_list_empty(&tmh_page_list) )
+ {
+ scrub_list_splice(&tmh_page_list);
+ INIT_PAGE_LIST_HEAD(&tmh_page_list);
+ }
+ spin_unlock(&tmh_page_list_lock);
+}
+
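+/* Scrub a page being released unless the caller passed MEMF_tmem. */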
+EXPORT void tmh_scrub_page(struct page_info *pi, unsigned int memflags)
+{
+ if ( pi == NULL )
+ return;
+ if ( !(memflags & MEMF_tmem) )
+ scrub_one_page(pi);
+}
+
+#ifndef __i386__
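+/* Page get/put callbacks backing the global tmem xmem_pool; these rely on
+ * page_to_virt(), so they are only built on non-i386. */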
+static noinline void *tmh_mempool_page_get(unsigned long size)
+{
+ struct page_info *pi;
+
+ ASSERT(size == PAGE_SIZE);
+ if ( (pi = tmh_alloc_page(NULL,0)) == NULL )
+ return NULL;
+ ASSERT(IS_VALID_PAGE(pi));
+ return page_to_virt(pi);
+}
+
+static void tmh_mempool_page_put(void *page_va)
+{
+ ASSERT(IS_PAGE_ALIGNED(page_va));
+ tmh_free_page(virt_to_page(page_va));
+}
+
+static int tmh_mempool_init(void)
+{
+ tmh_mempool = xmem_pool_create("tmem", tmh_mempool_page_get,
+ tmh_mempool_page_put, PAGE_SIZE, 0, PAGE_SIZE);
+ if ( tmh_mempool )
+ tmh_mempool_maxalloc = xmem_pool_maxalloc(tmh_mempool);
+ return tmh_mempool != NULL;
+}
+
+/* persistent pools are per-domain */
+
+static void *tmh_persistent_pool_page_get(unsigned long size)
+{
+ struct page_info *pi;
+ struct domain *d = current->domain;
+
+ ASSERT(size == PAGE_SIZE);
+ if ( (pi = _tmh_alloc_page_thispool(d)) == NULL )
+ return NULL;
+ ASSERT(IS_VALID_PAGE(pi));
+ return map_domain_page(page_to_mfn(pi));
+}
+
+static void tmh_persistent_pool_page_put(void *page_va)
+{
+ struct page_info *pi;
+
+ ASSERT(IS_PAGE_ALIGNED(page_va));
+ pi = virt_to_page(page_va);
+ ASSERT(IS_VALID_PAGE(pi));
+ _tmh_free_page_thispool(pi);
+}
+#endif
+
+/****************** XEN-SPECIFIC CLIENT HANDLING ********************/
+
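+/* Allocate per-client tmem state; the per-domain persistent pool (non-i386
+ * only) is named with the four nibbles of the domain id. */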
+EXPORT tmh_client_t *tmh_client_init(void)
+{
+ tmh_client_t *tmh;
+ char name[5];
+ domid_t domid = current->domain->domain_id;
+ int i, shift;
+
+ if ( (tmh = xmalloc(tmh_client_t)) == NULL )
+ return NULL;
+    for (i = 0, shift = 12; i < 4; shift -= 4, i++)
+ name[i] = ((unsigned short)domid >> shift) & 0xf;
+ name[4] = '\0';
+#ifndef __i386__
+ tmh->persistent_pool = xmem_pool_create(name, tmh_persistent_pool_page_get,
+ tmh_persistent_pool_page_put, PAGE_SIZE, 0, PAGE_SIZE);
+ if ( tmh->persistent_pool == NULL )
+ {
+ xfree(tmh);
+ return NULL;
+ }
+#endif
+ tmh->domain = current->domain;
+ return tmh;
+}
+
+EXPORT void tmh_client_destroy(tmh_client_t *tmh)
+{
+#ifndef __i386__
+ xmem_pool_destroy(tmh->persistent_pool);
+#endif
+ xfree(tmh);
+}
+
+/****************** XEN-SPECIFIC HOST INITIALIZATION ********************/
+
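+/* One-time host initialization: set up the tmem memory pool and allocate
+ * the per-cpu LZO compression buffers (dstmem and workmem). */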
+EXPORT int tmh_init(void)
+{
+#ifndef __i386__
+ int dstmem_order, workmem_order;
+ bool_t bad_alloc = 0;
+ struct page_info *pi;
+ unsigned char *p1, *p2;
+ int cpu;
+
+ if ( !tmh_mempool_init() )
+ return 0;
+
+ dstmem_order = get_order_from_pages(LZO_DSTMEM_PAGES);
+ workmem_order = get_order_from_bytes(LZO1X_1_MEM_COMPRESS);
+ for_each_cpu ( cpu )
+ {
+ pi = alloc_domheap_pages(0,dstmem_order,0);
+ per_cpu(dstmem, cpu) = p1 = ((pi == NULL) ? NULL : page_to_virt(pi));
+ pi = alloc_domheap_pages(0,workmem_order,0);
+ per_cpu(workmem, cpu) = p2 = ((pi == NULL) ? NULL : page_to_virt(pi));
+ if ( (p1 == NULL) || (p2 == NULL) )
+ bad_alloc++;
+ }
+ if ( bad_alloc )
+ printk("tmem: can't allocate compression buffers for %d cpus\n",
+ bad_alloc);
+#endif
+ return 1;
+}