diff options
Diffstat (limited to 'tools')
-rw-r--r-- | tools/Makefile | 4 | ||||
-rwxr-xr-x | tools/examples/xc_dom_control.py | 4 | ||||
-rwxr-xr-x | tools/examples/xc_dom_create.py | 123 | ||||
-rw-r--r-- | tools/xc/lib/Makefile | 2 | ||||
-rw-r--r-- | tools/xc/lib/xc_domain.c | 1 | ||||
-rw-r--r-- | tools/xc/lib/xc_linux_build.c | 25 | ||||
-rw-r--r-- | tools/xc/lib/xc_linux_restore.c | 242 | ||||
-rw-r--r-- | tools/xc/lib/xc_linux_save.c | 392 | ||||
-rw-r--r-- | tools/xc/lib/xc_netbsd_build.c | 26 | ||||
-rw-r--r-- | tools/xc/lib/xc_private.c | 228 | ||||
-rw-r--r-- | tools/xc/lib/xc_private.h | 100 | ||||
-rw-r--r-- | tools/xend/lib/blkif.py | 143 | ||||
-rw-r--r-- | tools/xend/lib/console.py | 83 | ||||
-rw-r--r-- | tools/xend/lib/domain_controller.h | 124 | ||||
-rwxr-xr-x | tools/xend/lib/main.py | 179 | ||||
-rw-r--r-- | tools/xend/lib/manager.py | 89 | ||||
-rw-r--r-- | tools/xend/lib/utils.c | 4 |
17 files changed, 1350 insertions, 419 deletions
diff --git a/tools/Makefile b/tools/Makefile index 0d4c43fb24..9ddf5f25a2 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -17,6 +17,10 @@ install: all $(MAKE) -C xenctl install $(MAKE) -C xend install +dist: $(TARGET) + $(MAKE) prefix=`pwd`/../../install dist=yes install + + clean: $(MAKE) -C balloon clean $(MAKE) -C xc clean diff --git a/tools/examples/xc_dom_control.py b/tools/examples/xc_dom_control.py index ea97a45f0e..4f0bd5de52 100755 --- a/tools/examples/xc_dom_control.py +++ b/tools/examples/xc_dom_control.py @@ -137,6 +137,10 @@ elif cmd == 'suspend': os.kill(pid, signal.SIGTERM) xc.domain_stop( dom=dom ) + + while not xc.domain_getinfo( first_dom=dom, max_doms=1 )[0]['stopped']: + time.sleep(0.1); + rc = xc.linux_save( dom=dom, state_file=file, progress=1) if rc == 0 : xc.domain_destroy( dom=dom, force=1 ) diff --git a/tools/examples/xc_dom_create.py b/tools/examples/xc_dom_create.py index e803737ef0..0a66613da4 100755 --- a/tools/examples/xc_dom_create.py +++ b/tools/examples/xc_dom_create.py @@ -1,7 +1,7 @@ #!/usr/bin/env python import string, sys, os, time, socket, getopt, signal, syslog -import Xc, xenctl.utils, xenctl.console_client +import Xc, xenctl.utils, xenctl.console_client, re config_dir = '/etc/xc/' config_file = xc_config_file = config_dir + 'defaults' @@ -195,6 +195,15 @@ output('VM cmdline : "%s"' % cmdline) if dryrun: sys.exit(1) +##### HACK HACK HACK +##### Until everyone moves to the new I/O world, and a more robust domain +##### controller (xend), we use this little trick to discover whether we +##### are in a testing environment for new I/O stuff. +new_io_world = True +for line in os.popen('cat /proc/interrupts').readlines(): + if re.search('blkdev', line): + new_io_world = False + ##### Code beyond this point is actually used to manage the mechanics of ##### starting (and watching if necessary) guest virtual machines. @@ -228,19 +237,19 @@ def make_domain(): cmsg = 'new_control_interface(dom='+str(id)+', console_port='+str(console_port)+')' - xend_response = xenctl.utils.xend_control_message(cmsg) + cons_response = xenctl.utils.xend_control_message(cmsg) - if not xend_response['success']: + if not cons_response['success']: print "Error creating initial event channel" - print "Error type: " + xend_response['error_type'] - if xend_response['error_type'] == 'exception': - print "Exception type: " + xend_response['exception_type'] - print "Exception value: " + xend_response['exception_value'] + print "Error type: " + cons_response['error_type'] + if cons_response['error_type'] == 'exception': + print "Exception type: " + cons_response['exception_type'] + print "Exception value: " + cons_response['exception_value'] xc.domain_destroy ( dom=id ) sys.exit() if restore: - ret = eval('xc.%s_restore ( dom=id, state_file=state_file, progress=1 )' % builder_fn ) + ret = eval('xc.%s_restore ( dom=id, state_file=state_file, progress=1)' % (builder_fn) ) if ret < 0: print "Error restoring domain" print "Return code = " + str(ret) @@ -248,7 +257,7 @@ def make_domain(): sys.exit() else: - ret = eval('xc.%s_build ( dom=id, image=image, ramdisk=ramdisk, cmdline=cmdline, control_evtchn=xend_response["remote_port"] )' % builder_fn ) + ret = eval('xc.%s_build ( dom=id, image=image, ramdisk=ramdisk, cmdline=cmdline, control_evtchn=cons_response["remote_port"] )' % builder_fn ) if ret < 0: print "Error building Linux guest OS: " print "Return code = " + str(ret) @@ -259,6 +268,18 @@ def make_domain(): # set the expertise level appropriately xenctl.utils.VBD_EXPERT_MODE = vbd_expert + + if new_io_world: + cmsg = 'new_block_interface(dom='+str(id)+')' + xend_response = xenctl.utils.xend_control_message(cmsg) + if not xend_response['success']: + print "Error creating block interface" + print "Error type: " + xend_response['error_type'] + if xend_response['error_type'] == 'exception': + print "Exception type: " + xend_response['exception_type'] + print "Exception val: " + xend_response['exception_value'] + xc.domain_destroy ( dom=id ) + sys.exit() for ( uname, virt_name, rw ) in vbd_list: virt_dev = xenctl.utils.blkdev_name_to_number( virt_name ) @@ -269,42 +290,70 @@ def make_domain(): xc.domain_destroy ( dom=id ) sys.exit() - # check that setting up this VBD won't violate the sharing - # allowed by the current VBD expertise level - if xenctl.utils.vd_extents_validate(segments, rw=='w' or rw=='rw') < 0: - xc.domain_destroy( dom = id ) - sys.exit() + if new_io_world: + if len(segments) > 1: + print "New I/O world cannot deal with multi-extent vdisks" + xc.domain_destroy ( dom=id ) + sys.exit() + seg = segments[0] + cmsg = 'new_block_device(dom=' + str(id) + \ + ',handle=0,vdev=' + str(virt_dev) + \ + ',pdev=' + str(seg['device']) + \ + ',start_sect=' + str(seg['start_sector']) + \ + ',nr_sect=' + str(seg['nr_sectors']) + \ + ',readonly=' + str(not re.match('w',rw)) + ')' + xend_response = xenctl.utils.xend_control_message(cmsg) + if not xend_response['success']: + print "Error creating virtual block device" + print "Error type: " + xend_response['error_type'] + if xend_response['error_type'] == 'exception': + print "Exception type: " + xend_response['exception_type'] + print "Exception val: " + xend_response['exception_value'] + xc.domain_destroy ( dom=id ) + sys.exit() + else: + # check that setting up this VBD won't violate the sharing + # allowed by the current VBD expertise level + if xenctl.utils.vd_extents_validate(segments, + rw=='w' or rw=='rw') < 0: + xc.domain_destroy( dom = id ) + sys.exit() - if xc.vbd_create( dom=id, vbd=virt_dev, writeable= rw=='w' or rw=='rw' ): - print "Error creating VBD vbd=%d writeable=%d\n" % (virt_dev,rw) - xc.domain_destroy ( dom=id ) - sys.exit() + if xc.vbd_create( dom=id, vbd=virt_dev, + writeable= rw=='w' or rw=='rw' ): + print "Error creating VBD %d (writeable=%d)\n" % (virt_dev,rw) + xc.domain_destroy ( dom=id ) + sys.exit() - if xc.vbd_setextents( dom=id, - vbd=virt_dev, - extents=segments): - print "Error populating VBD vbd=%d\n" % virt_dev - xc.domain_destroy ( dom=id ) - sys.exit() - - # setup virtual firewall rules for all aliases - for ip in vfr_ipaddr: - xenctl.utils.setup_vfr_rules_for_vif( id, 0, ip ) - - # check for physical device access - for (pci_bus, pci_dev, pci_func) in pci_device_list: - if xc.physdev_pci_access_modify( - dom=id, bus=pci_bus, dev=pci_dev, func=pci_func, enable=1 ) < 0: - print "Non-fatal error enabling PCI device access." - else: - print "Enabled PCI access (%d:%d:%d)." % (pci_bus,pci_dev,pci_func) + if xc.vbd_setextents( dom=id, + vbd=virt_dev, + extents=segments): + print "Error populating VBD vbd=%d\n" % virt_dev + xc.domain_destroy ( dom=id ) + sys.exit() + + if not new_io_world: + # setup virtual firewall rules for all aliases + for ip in vfr_ipaddr: + xenctl.utils.setup_vfr_rules_for_vif( id, 0, ip ) + + if new_io_world: + # check for physical device access + for (pci_bus, pci_dev, pci_func) in pci_device_list: + if xc.physdev_pci_access_modify( + dom=id, bus=pci_bus, dev=pci_dev, + func=pci_func, enable=1 ) < 0: + print "Non-fatal error enabling PCI device access." + else: + print "Enabled PCI access (%d:%d:%d)." % \ + (pci_bus,pci_dev,pci_func) if xc.domain_start( dom=id ) < 0: print "Error starting domain" xc.domain_destroy ( dom=id ) sys.exit() - return (id, xend_response['console_port']) + return (id, cons_response['console_port']) # end of make_domain() def mkpidfile(): diff --git a/tools/xc/lib/Makefile b/tools/xc/lib/Makefile index 79dce046df..f542935167 100644 --- a/tools/xc/lib/Makefile +++ b/tools/xc/lib/Makefile @@ -4,7 +4,7 @@ MINOR = 0 SONAME = libxc.so.$(MAJOR) CC = gcc -CFLAGS = -c -Wall -O3 -fno-strict-aliasing +CFLAGS = -c -Werror -O3 -fno-strict-aliasing CFLAGS += -I../../../xen/include/hypervisor-ifs CFLAGS += -I../../xend/lib CFLAGS += -I../../../xenolinux-sparse/include diff --git a/tools/xc/lib/xc_domain.c b/tools/xc/lib/xc_domain.c index ec28f2686b..1d77bfc016 100644 --- a/tools/xc/lib/xc_domain.c +++ b/tools/xc/lib/xc_domain.c @@ -84,6 +84,7 @@ int xc_domain_getinfo(int xc_handle, { op.cmd = DOM0_GETDOMAININFO; op.u.getdomaininfo.domain = (domid_t)next_domid; + op.u.getdomaininfo.ctxt = NULL; // no exec context info, thanks. if ( do_dom0_op(xc_handle, &op) < 0 ) break; info->domid = (u64)op.u.getdomaininfo.domain; diff --git a/tools/xc/lib/xc_linux_build.c b/tools/xc/lib/xc_linux_build.c index f1bd182827..83debd904d 100644 --- a/tools/xc/lib/xc_linux_build.c +++ b/tools/xc/lib/xc_linux_build.c @@ -26,6 +26,7 @@ static long get_tot_pages(int xc_handle, u64 domid) dom0_op_t op; op.cmd = DOM0_GETDOMAININFO; op.u.getdomaininfo.domain = (domid_t)domid; + op.u.getdomaininfo.ctxt = NULL; return (do_dom0_op(xc_handle, &op) < 0) ? -1 : op.u.getdomaininfo.tot_pages; } @@ -70,7 +71,7 @@ static int setup_guestos(int xc_handle, gzFile initrd_gfd, unsigned long initrd_len, unsigned long nr_pages, unsigned long *pvsi, unsigned long *pvke, - dom0_builddomain_t *builddomain, + full_execution_context_t *ctxt, const char *cmdline, unsigned long shared_info_frame, unsigned int control_evtchn) @@ -163,8 +164,6 @@ static int setup_guestos(int xc_handle, v_start, v_end); printf(" ENTRY ADDRESS: %08lx\n", vkern_entry); - memset(builddomain, 0, sizeof(*builddomain)); - if ( (pm_handle = init_pfn_mapper((domid_t)dom)) < 0 ) goto error_out; @@ -205,7 +204,7 @@ static int setup_guestos(int xc_handle, /* First allocate page for page dir. */ ppt_alloc = (vpt_start - v_start) >> PAGE_SHIFT; l2tab = page_array[ppt_alloc++] << PAGE_SHIFT; - builddomain->ctxt.pt_base = l2tab; + ctxt->pt_base = l2tab; /* Initialise the page tables. */ if ( (vl2tab = map_pfn_writeable(pm_handle, l2tab >> PAGE_SHIFT)) == NULL ) @@ -388,7 +387,7 @@ int xc_linux_build(int xc_handle, int initrd_fd = -1; gzFile initrd_gfd = NULL; int rc, i; - full_execution_context_t *ctxt; + full_execution_context_t st_ctxt, *ctxt = &st_ctxt; unsigned long nr_pages; char *image = NULL; unsigned long image_size, initrd_size=0; @@ -420,8 +419,15 @@ int xc_linux_build(int xc_handle, } } + if ( mlock(&st_ctxt, sizeof(st_ctxt) ) ) + { + PERROR("Unable to mlock ctxt"); + return 1; + } + op.cmd = DOM0_GETDOMAININFO; op.u.getdomaininfo.domain = (domid_t)domid; + op.u.getdomaininfo.ctxt = ctxt; if ( (do_dom0_op(xc_handle, &op) < 0) || ((u64)op.u.getdomaininfo.domain != domid) ) { @@ -429,7 +435,7 @@ int xc_linux_build(int xc_handle, goto error_out; } if ( (op.u.getdomaininfo.state != DOMSTATE_STOPPED) || - (op.u.getdomaininfo.ctxt.pt_base != 0) ) + (ctxt->pt_base != 0) ) { ERROR("Domain is already constructed"); goto error_out; @@ -438,7 +444,7 @@ int xc_linux_build(int xc_handle, if ( setup_guestos(xc_handle, domid, image, image_size, initrd_gfd, initrd_size, nr_pages, &vstartinfo_start, &vkern_entry, - &launch_op.u.builddomain, cmdline, + ctxt, cmdline, op.u.getdomaininfo.shared_info_frame, control_evtchn) < 0 ) { @@ -453,8 +459,6 @@ int xc_linux_build(int xc_handle, if ( image != NULL ) free(image); - ctxt = &launch_op.u.builddomain.ctxt; - ctxt->flags = 0; /* @@ -507,8 +511,11 @@ int xc_linux_build(int xc_handle, ctxt->failsafe_callback_cs = FLAT_GUESTOS_CS; ctxt->failsafe_callback_eip = 0; + memset( &launch_op, 0, sizeof(launch_op) ); + launch_op.u.builddomain.domain = (domid_t)domid; launch_op.u.builddomain.num_vifs = 1; + launch_op.u.builddomain.ctxt = ctxt; launch_op.cmd = DOM0_BUILDDOMAIN; rc = do_dom0_op(xc_handle, &launch_op); diff --git a/tools/xc/lib/xc_linux_restore.c b/tools/xc/lib/xc_linux_restore.c index 239df65984..e27221281a 100644 --- a/tools/xc/lib/xc_linux_restore.c +++ b/tools/xc/lib/xc_linux_restore.c @@ -10,6 +10,8 @@ #include <asm-xen/suspend.h> #include <zlib.h> +#define MAX_BATCH_SIZE 1024 + /* This may allow us to create a 'quiet' command-line option, if necessary. */ #define verbose_printf(_f, _a...) \ do { \ @@ -58,8 +60,8 @@ int xc_linux_restore(int xc_handle, u64 *pdomid) { dom0_op_t op; - int rc = 1, i, j; - unsigned long mfn, pfn; + int rc = 1, i, j, n, k; + unsigned long mfn, pfn, xpfn; unsigned int prev_pc, this_pc; /* Number of page frames in use by this Linux session. */ @@ -93,6 +95,9 @@ int xc_linux_restore(int xc_handle, /* A temporary mapping of the guest's suspend record. */ suspend_record_t *p_srec; + mfn_mapper_t *region_mapper, *mapper_handle1; + char *region_base; + /* The name and descriptor of the file that we are reading from. */ int fd; gzFile gfd; @@ -114,6 +119,14 @@ int xc_linux_restore(int xc_handle, return 1; } + if ( mlock(&ctxt, sizeof(ctxt) ) ) + { + /* needed for when we do the build dom0 op, + but might as well do early */ + PERROR("Unable to mlock ctxt"); + return 1; + } + /* Start writing out the saved-domain record. */ if ( !checked_read(gfd, signature, 16) || (memcmp(signature, "LinuxGuestRecord", 16) != 0) ) @@ -159,12 +172,6 @@ int xc_linux_restore(int xc_handle, goto out; } - if ( !checked_read(gfd, pfn_type, 4 * nr_pfns) ) - { - ERROR("Error when reading from state file"); - goto out; - } - /* Set the domain's name to that from the restore file */ if ( xc_domain_setname( xc_handle, dom, name ) ) { @@ -184,6 +191,7 @@ int xc_linux_restore(int xc_handle, /* Get the domain's shared-info frame. */ op.cmd = DOM0_GETDOMAININFO; op.u.getdomaininfo.domain = (domid_t)dom; + op.u.getdomaininfo.ctxt = NULL; if ( do_dom0_op(xc_handle, &op) < 0 ) { ERROR("Could not get information on new domain"); @@ -212,6 +220,15 @@ int xc_linux_restore(int xc_handle, goto out; } + + if ( (region_mapper = mfn_mapper_init(xc_handle, dom, + MAX_BATCH_SIZE*PAGE_SIZE, + PROT_WRITE )) + == NULL ) + goto out; + + region_base = mfn_mapper_base( region_mapper ); + verbose_printf("Reloading memory pages: 0%%"); /* @@ -219,75 +236,141 @@ int xc_linux_restore(int xc_handle, * We uncanonicalise page tables as we go. */ prev_pc = 0; - for ( i = 0; i < nr_pfns; i++ ) + + n=0; + while(1) { - this_pc = (i * 100) / nr_pfns; + int j; + unsigned long region_pfn_type[1024]; + + this_pc = (n * 100) / nr_pfns; if ( (this_pc - prev_pc) >= 5 ) { verbose_printf("\b\b\b\b%3d%%", this_pc); prev_pc = this_pc; } - mfn = pfn_to_mfn_table[i]; - - ppage = map_pfn_writeable(pm_handle, mfn); - - if ( !checked_read(gfd, ppage, PAGE_SIZE) ) + if ( !checked_read(gfd, &j, sizeof(int)) ) { ERROR("Error when reading from state file"); goto out; } - if ( pfn_type[i] == L1TAB ) + //printf("batch=%d\n",j); + + if(j==0) break; // our work here is done + + if ( !checked_read(gfd, region_pfn_type, j*sizeof(unsigned long)) ) { - for ( j = 0; j < 1024; j++ ) - { - if ( ppage[j] & _PAGE_PRESENT ) - { - if ( (pfn = ppage[j] >> PAGE_SHIFT) >= nr_pfns ) - { - ERROR("Frame number in page table is out of range"); - goto out; - } - if ( (pfn_type[pfn] != NONE) && (ppage[j] & _PAGE_RW) ) - { - ERROR("Write access requested for a restricted frame"); - goto out; - } - ppage[j] &= (PAGE_SIZE - 1) & ~(_PAGE_GLOBAL | _PAGE_PAT); - ppage[j] |= pfn_to_mfn_table[pfn] << PAGE_SHIFT; - } - } - } - else if ( pfn_type[i] == L2TAB ) - { - for ( j = 0; j < (HYPERVISOR_VIRT_START>>L2_PAGETABLE_SHIFT); j++ ) - { - if ( ppage[j] & _PAGE_PRESENT ) - { - if ( (pfn = ppage[j] >> PAGE_SHIFT) >= nr_pfns ) - { - ERROR("Frame number in page table is out of range"); - goto out; - } - if ( pfn_type[pfn] != L1TAB ) - { - ERROR("Page table mistyping"); - goto out; - } - ppage[j] &= (PAGE_SIZE - 1) & ~(_PAGE_GLOBAL | _PAGE_PSE); - ppage[j] |= pfn_to_mfn_table[pfn] << PAGE_SHIFT; - } - } + ERROR("Error when reading from state file"); + goto out; } - unmap_pfn(pm_handle, ppage); + for(i=0;i<j;i++) + { + pfn = region_pfn_type[i] & ~PGT_type_mask; + mfn = pfn_to_mfn_table[pfn]; + + mfn_mapper_queue_entry( region_mapper, i<<PAGE_SHIFT, + mfn, PAGE_SIZE ); + } + + if( mfn_mapper_flush_queue(region_mapper) ) + { + ERROR("Couldn't map page region"); + goto out; + } + + + for(i=0;i<j;i++) + { + unsigned long *ppage; + + pfn = region_pfn_type[i] & ~PGT_type_mask; + +//if(pfn_type[i])printf("^pfn=%d %08lx\n",pfn,pfn_type[i]); + + if (pfn>nr_pfns) + { + ERROR("pfn out of range"); + goto out; + } + + region_pfn_type[i] &= PGT_type_mask; + + pfn_type[pfn] = region_pfn_type[i]; + + mfn = pfn_to_mfn_table[pfn]; + +//if(region_pfn_type[i])printf("i=%d pfn=%d mfn=%d type=%lx\n",i,pfn,mfn,region_pfn_type[i]); + + ppage = (unsigned long*) (region_base + i*PAGE_SIZE); + + if ( !checked_read(gfd, ppage, PAGE_SIZE) ) + { + ERROR("Error when reading from state file"); + goto out; + } + + if ( region_pfn_type[i] == L1TAB ) + { + for ( k = 0; k < 1024; k++ ) + { + if ( ppage[k] & _PAGE_PRESENT ) + { + if ( (xpfn = ppage[k] >> PAGE_SHIFT) >= nr_pfns ) + { + ERROR("Frame number in type %d page table is out of range. i=%d k=%d pfn=%d nr_pfns=%d",region_pfn_type[i],i,k,xpfn,nr_pfns); + goto out; + } +#if 0 + if ( (region_pfn_type[xpfn] != NONE) && (ppage[k] & _PAGE_RW) ) + { + ERROR("Write access requested for a restricted frame"); + goto out; + } +#endif + ppage[k] &= (PAGE_SIZE - 1) & ~(_PAGE_GLOBAL | _PAGE_PAT); + ppage[k] |= pfn_to_mfn_table[xpfn] << PAGE_SHIFT; + } + } + } + else if ( region_pfn_type[i] == L2TAB ) + { + for ( k = 0; k < (HYPERVISOR_VIRT_START>>L2_PAGETABLE_SHIFT); k++ ) + { + if ( ppage[k] & _PAGE_PRESENT ) + { + if ( (xpfn = ppage[k] >> PAGE_SHIFT) >= nr_pfns ) + { + ERROR("Frame number in page table is out of range"); + goto out; + } +#if 0 + if ( region_pfn_type[pfn] != L1TAB ) + { + ERROR("Page table mistyping"); + goto out; + } +#endif + ppage[k] &= (PAGE_SIZE - 1) & ~(_PAGE_GLOBAL | _PAGE_PSE); + ppage[k] |= pfn_to_mfn_table[xpfn] << PAGE_SHIFT; + } + } + } + + if ( add_mmu_update(xc_handle, mmu, + (mfn<<PAGE_SHIFT) | MMU_MACHPHYS_UPDATE, pfn) ) + goto out; + + } + + n+=j; // crude stats - if ( add_mmu_update(xc_handle, mmu, - (mfn<<PAGE_SHIFT) | MMU_MACHPHYS_UPDATE, i) ) - goto out; } + mfn_mapper_close( region_mapper ); + /* * Pin page tables. Do this after writing to them as otherwise Xen * will barf when doing the type-checking. @@ -352,26 +435,47 @@ int xc_linux_restore(int xc_handle, pfn = ctxt.pt_base >> PAGE_SHIFT; if ( (pfn >= nr_pfns) || (pfn_type[pfn] != L2TAB) ) { - ERROR("PT base is bad"); + printf("PT base is bad. pfn=%d nr=%d type=%08lx %08lx\n", + pfn, nr_pfns, pfn_type[pfn], L2TAB); + ERROR("PT base is bad."); goto out; } ctxt.pt_base = pfn_to_mfn_table[pfn] << PAGE_SHIFT; /* Uncanonicalise the pfn-to-mfn table frame-number list. */ - for ( i = 0; i < nr_pfns; i += 1024 ) + + + if ( (mapper_handle1 = mfn_mapper_init(xc_handle, dom, + 1024*1024, PROT_WRITE )) + == NULL ) + goto out; + + for ( i = 0; i < (nr_pfns+1023)/1024; i++ ) { - unsigned long copy_size = (nr_pfns - i) * sizeof(unsigned long); - if ( copy_size > PAGE_SIZE ) copy_size = PAGE_SIZE; - pfn = pfn_to_mfn_frame_list[i/1024]; + unsigned long pfn, mfn; + + pfn = pfn_to_mfn_frame_list[i]; if ( (pfn >= nr_pfns) || (pfn_type[pfn] != NONE) ) { ERROR("PFN-to-MFN frame number is bad"); goto out; } - ppage = map_pfn_writeable(pm_handle, pfn_to_mfn_table[pfn]); - memcpy(ppage, &pfn_to_mfn_table[i], copy_size); - unmap_pfn(pm_handle, ppage); + mfn = pfn_to_mfn_table[pfn]; + + mfn_mapper_queue_entry( mapper_handle1, i<<PAGE_SHIFT, + mfn, PAGE_SIZE ); } + + if ( mfn_mapper_flush_queue(mapper_handle1) ) + { + ERROR("Couldn't map pfn_to_mfn table"); + goto out; + } + + memcpy( mfn_mapper_base( mapper_handle1 ), pfn_to_mfn_table, + nr_pfns*sizeof(unsigned long) ); + + mfn_mapper_close( mapper_handle1 ); /* * Safety checking of saved context: @@ -406,11 +510,11 @@ int xc_linux_restore(int xc_handle, ERROR("Bad LDT base or size"); goto out; } - + op.cmd = DOM0_BUILDDOMAIN; op.u.builddomain.domain = (domid_t)dom; op.u.builddomain.num_vifs = 1; - memcpy(&op.u.builddomain.ctxt, &ctxt, sizeof(ctxt)); + op.u.builddomain.ctxt = &ctxt; rc = do_dom0_op(xc_handle, &op); out: diff --git a/tools/xc/lib/xc_linux_save.c b/tools/xc/lib/xc_linux_save.c index dc759f546c..88ed9e15d7 100644 --- a/tools/xc/lib/xc_linux_save.c +++ b/tools/xc/lib/xc_linux_save.c @@ -10,6 +10,8 @@ #include <asm-xen/suspend.h> #include <zlib.h> +#define BATCH_SIZE 1024 /* 1024 pages (4MB) at a time */ + /* This may allow us to create a 'quiet' command-line option, if necessary. */ #define verbose_printf(_f, _a...) \ do { \ @@ -24,7 +26,7 @@ */ #define MFN_IS_IN_PSEUDOPHYS_MAP(_mfn) \ (((_mfn) < (1024*1024)) && \ - (pfn_to_mfn_table[mfn_to_pfn_table[_mfn]] == (_mfn))) + (live_pfn_to_mfn_table[live_mfn_to_pfn_table[_mfn]] == (_mfn))) /* Returns TRUE if MFN is successfully converted to a PFN. */ #define translate_mfn_to_pfn(_pmfn) \ @@ -34,37 +36,11 @@ if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) ) \ _res = 0; \ else \ - *(_pmfn) = mfn_to_pfn_table[mfn]; \ + *(_pmfn) = live_mfn_to_pfn_table[mfn]; \ _res; \ }) -static int check_pfn_ownership(int xc_handle, - unsigned long mfn, - u64 dom) -{ - dom0_op_t op; - op.cmd = DOM0_GETPAGEFRAMEINFO; - op.u.getpageframeinfo.pfn = mfn; - op.u.getpageframeinfo.domain = (domid_t)dom; - return (do_dom0_op(xc_handle, &op) >= 0); -} -#define GETPFN_ERR (~0U) -static unsigned int get_pfn_type(int xc_handle, - unsigned long mfn, - u64 dom) -{ - dom0_op_t op; - op.cmd = DOM0_GETPAGEFRAMEINFO; - op.u.getpageframeinfo.pfn = mfn; - op.u.getpageframeinfo.domain = (domid_t)dom; - if ( do_dom0_op(xc_handle, &op) < 0 ) - { - PERROR("Unexpected failure when getting page frame info!"); - return GETPFN_ERR; - } - return op.u.getpageframeinfo.type; -} static int checked_write(gzFile fd, void *buf, size_t count) { @@ -80,10 +56,13 @@ int xc_linux_save(int xc_handle, int verbose) { dom0_op_t op; - int rc = 1, i, j; + int rc = 1, i, j, k, n; unsigned long mfn; unsigned int prev_pc, this_pc; + /* state of the new MFN mapper */ + mfn_mapper_t *mapper_handle1, *mapper_handle2; + /* Remember if we stopped the guest, so we can restart it on exit. */ int we_stopped_it = 0; @@ -100,18 +79,23 @@ int xc_linux_save(int xc_handle, unsigned long *pfn_type = NULL; /* A temporary mapping, and a copy, of one frame of guest memory. */ - unsigned long *ppage, page[1024]; + unsigned long page[1024]; - /* A temporary mapping, and a copy, of the pfn-to-mfn table frame list. */ - unsigned long *p_pfn_to_mfn_frame_list, pfn_to_mfn_frame_list[1024]; - /* A temporary mapping of one frame in the above list. */ - unsigned long *pfn_to_mfn_frame; + /* A copy of the pfn-to-mfn table frame list. */ + unsigned long *live_pfn_to_mfn_frame_list; + unsigned long pfn_to_mfn_frame_list[1024]; - /* A table mapping each PFN to its current MFN. */ - unsigned long *pfn_to_mfn_table = NULL; - /* A table mapping each current MFN to its canonical PFN. */ - unsigned long *mfn_to_pfn_table = NULL; + /* Live mapping of the table mapping each PFN to its current MFN. */ + unsigned long *live_pfn_to_mfn_table = NULL; + /* Live mapping of system MFN to PFN table. */ + unsigned long *live_mfn_to_pfn_table = NULL; + /* Live mapping of shared info structure */ + unsigned long *live_shinfo; + + /* base of the region in which domain memory is mapped */ + unsigned char *region_base; + /* A temporary mapping, and a copy, of the guest's suspend record. */ suspend_record_t *p_srec, srec; @@ -138,11 +122,18 @@ int xc_linux_save(int xc_handle, return 1; } + if ( mlock(&ctxt, sizeof(ctxt) ) ) + { + PERROR("Unable to mlock ctxt"); + return 1; + } + /* Ensure that the domain exists, and that it is stopped. */ for ( ; ; ) { op.cmd = DOM0_GETDOMAININFO; op.u.getdomaininfo.domain = (domid_t)domid; + op.u.getdomaininfo.ctxt = &ctxt; if ( (do_dom0_op(xc_handle, &op) < 0) || ((u64)op.u.getdomaininfo.domain != domid) ) { @@ -150,7 +141,6 @@ int xc_linux_save(int xc_handle, goto out; } - memcpy(&ctxt, &op.u.getdomaininfo.ctxt, sizeof(ctxt)); memcpy(name, op.u.getdomaininfo.name, sizeof(name)); shared_info_frame = op.u.getdomaininfo.shared_info_frame; @@ -178,99 +168,115 @@ int xc_linux_save(int xc_handle, goto out; } - if ( (pm_handle = init_pfn_mapper((domid_t)domid)) < 0 ) - goto out; - /* Is the suspend-record MFN actually valid for this domain? */ - if ( !check_pfn_ownership(xc_handle, ctxt.cpu_ctxt.esi, domid) ) + /* Map the suspend-record MFN to pin it. The page must be owned by + domid for this to succeed. */ + p_srec = mfn_mapper_map_single(xc_handle, domid, + sizeof(srec), PROT_READ, + ctxt.cpu_ctxt.esi ); + + if (!p_srec) { - ERROR("Invalid state record pointer"); + ERROR("Couldn't map state record"); goto out; } - /* If the suspend-record MFN is okay then grab a copy of it to @srec. */ - p_srec = map_pfn_readonly(pm_handle, ctxt.cpu_ctxt.esi); - memcpy(&srec, p_srec, sizeof(srec)); - unmap_pfn(pm_handle, p_srec); + memcpy( &srec, p_srec, sizeof(srec) ); + /* cheesy sanity check */ if ( srec.nr_pfns > 1024*1024 ) { ERROR("Invalid state record -- pfn count out of range"); goto out; } - if ( !check_pfn_ownership(xc_handle, srec.pfn_to_mfn_frame_list, domid) ) + /* the pfn_to_mfn_frame_list fits in a single page */ + live_pfn_to_mfn_frame_list = + mfn_mapper_map_single(xc_handle, domid, + PAGE_SIZE, PROT_READ, + srec.pfn_to_mfn_frame_list ); + + if (!live_pfn_to_mfn_frame_list) { - ERROR("Invalid pfn-to-mfn frame list pointer"); + ERROR("Couldn't map pfn_to_mfn_frame_list"); goto out; } + + + if ( (mapper_handle1 = mfn_mapper_init(xc_handle, domid, + 1024*1024, PROT_READ )) + == NULL ) + goto out; + + for ( i = 0; i < (srec.nr_pfns+1023)/1024; i++ ) + { + /* Grab a copy of the pfn-to-mfn table frame list. + This has the effect of preventing the page from being freed and + given to another domain. (though the domain is stopped anyway...) */ + mfn_mapper_queue_entry( mapper_handle1, i<<PAGE_SHIFT, + live_pfn_to_mfn_frame_list[i], + PAGE_SIZE ); + } + + if ( mfn_mapper_flush_queue(mapper_handle1) ) + { + ERROR("Couldn't map pfn_to_mfn table"); + goto out; + } + + live_pfn_to_mfn_table = mfn_mapper_base( mapper_handle1 ); + - /* Grab a copy of the pfn-to-mfn table frame list. */ - p_pfn_to_mfn_frame_list = map_pfn_readonly( - pm_handle, srec.pfn_to_mfn_frame_list); - memcpy(pfn_to_mfn_frame_list, p_pfn_to_mfn_frame_list, PAGE_SIZE); - unmap_pfn(pm_handle, p_pfn_to_mfn_frame_list); /* We want zeroed memory so use calloc rather than malloc. */ - mfn_to_pfn_table = calloc(1, 4 * 1024 * 1024); - pfn_to_mfn_table = calloc(1, 4 * srec.nr_pfns); - pfn_type = calloc(1, 4 * srec.nr_pfns); + pfn_type = calloc(BATCH_SIZE, sizeof(unsigned long)); - if ( (mfn_to_pfn_table == NULL) || - (pfn_to_mfn_table == NULL) || - (pfn_type == NULL) ) + if ( (pfn_type == NULL) ) { errno = ENOMEM; goto out; } + if ( mlock( pfn_type, BATCH_SIZE * sizeof(unsigned long) ) ) + { + ERROR("Unable to mlock"); + goto out; + } - /* - * Construct the local pfn-to-mfn and mfn-to-pfn tables. On exit from this - * loop we have each MFN mapped at most once. Note that there may be MFNs - * that aren't mapped at all: we detect these by MFN_IS_IN_PSEUDOPHYS_MAP. - */ - pfn_to_mfn_frame = NULL; - for ( i = 0; i < srec.nr_pfns; i++ ) + + /* Track the mfn_to_pfn table down from the domains PT */ { - /* Each frameful of table frames must be checked & mapped on demand. */ - if ( (i & 1023) == 0 ) - { - mfn = pfn_to_mfn_frame_list[i/1024]; - if ( !check_pfn_ownership(xc_handle, mfn, domid) ) - { - ERROR("Invalid frame number if pfn-to-mfn frame list"); - goto out; - } - if ( pfn_to_mfn_frame != NULL ) - unmap_pfn(pm_handle, pfn_to_mfn_frame); - pfn_to_mfn_frame = map_pfn_readonly(pm_handle, mfn); - } - - mfn = pfn_to_mfn_frame[i & 1023]; + unsigned long *pgd; + unsigned long mfn_to_pfn_table_start_mfn; - if ( !check_pfn_ownership(xc_handle, mfn, domid) ) - { - ERROR("Invalid frame specified with pfn-to-mfn table"); - goto out; - } + pgd = mfn_mapper_map_single(xc_handle, domid, + PAGE_SIZE, PROT_READ, + ctxt.pt_base>>PAGE_SHIFT); - /* Did we map this MFN already? That would be invalid! */ - if ( MFN_IS_IN_PSEUDOPHYS_MAP(mfn) ) - { - ERROR("A machine frame appears twice in pseudophys space"); - goto out; - } + mfn_to_pfn_table_start_mfn = + pgd[HYPERVISOR_VIRT_START>>L2_PAGETABLE_SHIFT]>>PAGE_SHIFT; + + live_mfn_to_pfn_table = + mfn_mapper_map_single(xc_handle, ~0ULL, + PAGE_SIZE*1024, PROT_READ, + mfn_to_pfn_table_start_mfn ); + } - pfn_to_mfn_table[i] = mfn; - mfn_to_pfn_table[mfn] = i; - /* Query page type by MFN, but store it by PFN. */ - if ( (pfn_type[i] = get_pfn_type(xc_handle, mfn, domid)) == - GETPFN_ERR ) - goto out; + /* + * Quick belt and braces sanity check. + */ + + for ( i = 0; i < srec.nr_pfns; i++ ) + { + mfn = live_pfn_to_mfn_table[i]; + + if( live_mfn_to_pfn_table[mfn] != i ) + printf("i=%d mfn=%d live_mfn_to_pfn_table=%d\n", + i,mfn,live_mfn_to_pfn_table[mfn]); } + /* Canonicalise the suspend-record frame number. */ if ( !translate_mfn_to_pfn(&ctxt.cpu_ctxt.esi) ) { @@ -294,9 +300,10 @@ int xc_linux_save(int xc_handle, ERROR("PT base is not in range of pseudophys map"); goto out; } - ctxt.pt_base = mfn_to_pfn_table[ctxt.pt_base >> PAGE_SHIFT] << PAGE_SHIFT; + ctxt.pt_base = live_mfn_to_pfn_table[ctxt.pt_base >> PAGE_SHIFT] << PAGE_SHIFT; /* Canonicalise the pfn-to-mfn table frame-number list. */ + memcpy( pfn_to_mfn_frame_list, live_pfn_to_mfn_frame_list, PAGE_SIZE ); for ( i = 0; i < srec.nr_pfns; i += 1024 ) { if ( !translate_mfn_to_pfn(&pfn_to_mfn_frame_list[i/1024]) ) @@ -307,63 +314,152 @@ int xc_linux_save(int xc_handle, } /* Start writing out the saved-domain record. */ - ppage = map_pfn_readonly(pm_handle, shared_info_frame); + live_shinfo = mfn_mapper_map_single(xc_handle, domid, + PAGE_SIZE, PROT_READ, + shared_info_frame); + + if (!live_shinfo) + { + ERROR("Couldn't map live_shinfo"); + goto out; + } + if ( !checked_write(gfd, "LinuxGuestRecord", 16) || !checked_write(gfd, name, sizeof(name)) || !checked_write(gfd, &srec.nr_pfns, sizeof(unsigned long)) || !checked_write(gfd, &ctxt, sizeof(ctxt)) || - !checked_write(gfd, ppage, PAGE_SIZE) || - !checked_write(gfd, pfn_to_mfn_frame_list, PAGE_SIZE) || - !checked_write(gfd, pfn_type, 4 * srec.nr_pfns) ) + !checked_write(gfd, live_shinfo, PAGE_SIZE) || + !checked_write(gfd, pfn_to_mfn_frame_list, PAGE_SIZE) ) { ERROR("Error when writing to state file"); goto out; } - unmap_pfn(pm_handle, ppage); + munmap(live_shinfo, PAGE_SIZE); verbose_printf("Saving memory pages: 0%%"); + if ( (mapper_handle2 = mfn_mapper_init(xc_handle, domid, + BATCH_SIZE*4096, PROT_READ )) + == NULL ) + goto out; + + region_base = mfn_mapper_base( mapper_handle2 ); + /* Now write out each data page, canonicalising page tables as we go... */ prev_pc = 0; - for ( i = 0; i < srec.nr_pfns; i++ ) + for ( n = 0; n < srec.nr_pfns; ) { - this_pc = (i * 100) / srec.nr_pfns; + this_pc = (n * 100) / srec.nr_pfns; if ( (this_pc - prev_pc) >= 5 ) { verbose_printf("\b\b\b\b%3d%%", this_pc); prev_pc = this_pc; } - mfn = pfn_to_mfn_table[i]; - - ppage = map_pfn_readonly(pm_handle, mfn); - memcpy(page, ppage, PAGE_SIZE); - unmap_pfn(pm_handle, ppage); - - if ( (pfn_type[i] == L1TAB) || (pfn_type[i] == L2TAB) ) - { - for ( j = 0; - j < ((pfn_type[i] == L2TAB) ? - (HYPERVISOR_VIRT_START >> L2_PAGETABLE_SHIFT) : 1024); - j++ ) - { - if ( !(page[j] & _PAGE_PRESENT) ) continue; - mfn = page[j] >> PAGE_SHIFT; - if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) ) - { - ERROR("Frame number in pagetable page is invalid"); - goto out; - } - page[j] &= PAGE_SIZE - 1; - page[j] |= mfn_to_pfn_table[mfn] << PAGE_SHIFT; - } - } - - if ( !checked_write(gfd, page, PAGE_SIZE) ) - { - ERROR("Error when writing to state file"); - goto out; - } + for( j = 0, i = n; j < BATCH_SIZE && i < srec.nr_pfns ; j++, i++ ) + { + pfn_type[j] = live_pfn_to_mfn_table[i]; + } + + + for( j = 0, i = n; j < BATCH_SIZE && i < srec.nr_pfns ; j++, i++ ) + { + /* queue up mappings for all of the pages in this batch */ + +//printf("region n=%d j=%d i=%d mfn=%d\n",n,j,i,live_pfn_to_mfn_table[i]); + mfn_mapper_queue_entry( mapper_handle2, j<<PAGE_SHIFT, + live_pfn_to_mfn_table[i], + PAGE_SIZE ); + } + + if( mfn_mapper_flush_queue(mapper_handle2) ) + { + ERROR("Couldn't map page region"); + goto out; + } + + if ( get_pfn_type_batch(xc_handle, domid, j, pfn_type) ) + { + ERROR("get_pfn_type_batch failed"); + goto out; + } + + for( j = 0, i = n; j < BATCH_SIZE && i < srec.nr_pfns ; j++, i++ ) + { + if((pfn_type[j]>>29) == 7) + { + ERROR("bogus page"); + goto out; + } + + /* canonicalise mfn->pfn */ + pfn_type[j] = (pfn_type[j] & PGT_type_mask) | + live_mfn_to_pfn_table[pfn_type[j]&~PGT_type_mask]; + +/* if(pfn_type[j]>>29) + printf("i=%d type=%d\n",i,pfn_type[i]); */ + } + + + if ( !checked_write(gfd, &j, sizeof(int) ) ) + { + ERROR("Error when writing to state file"); + goto out; + } + + if ( !checked_write(gfd, pfn_type, sizeof(unsigned long)*j ) ) + { + ERROR("Error when writing to state file"); + goto out; + } + + + for( j = 0, i = n; j < BATCH_SIZE && i < srec.nr_pfns ; j++, i++ ) + { + /* write out pages in batch */ + + if ( ((pfn_type[j] & PGT_type_mask) == L1TAB) || + ((pfn_type[j] & PGT_type_mask) == L2TAB) ) + { + + memcpy(page, region_base + (PAGE_SIZE*j), PAGE_SIZE); + + for ( k = 0; + k < (((pfn_type[j] & PGT_type_mask) == L2TAB) ? + (HYPERVISOR_VIRT_START >> L2_PAGETABLE_SHIFT) : 1024); + k++ ) + { + if ( !(page[k] & _PAGE_PRESENT) ) continue; + mfn = page[k] >> PAGE_SHIFT; + if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) ) + { + ERROR("Frame number in pagetable page is invalid"); + goto out; + } + page[k] &= PAGE_SIZE - 1; + page[k] |= live_mfn_to_pfn_table[mfn] << PAGE_SHIFT; + + } + + if ( !checked_write(gfd, page, PAGE_SIZE) ) + { + ERROR("Error when writing to state file"); + goto out; + } + + + } + else + { + if ( !checked_write(gfd, region_base + (PAGE_SIZE*j), PAGE_SIZE) ) + { + ERROR("Error when writing to state file"); + goto out; + } + } + } + + n+=j; /* i is the master loop counter */ } verbose_printf("\b\b\b\b100%%\nMemory saved.\n"); @@ -371,10 +467,19 @@ int xc_linux_save(int xc_handle, /* Success! */ rc = 0; - out: + /* Zero terminate */ + if ( !checked_write(gfd, &rc, sizeof(int)) ) + { + ERROR("Error when writing to state file"); + goto out; + } + + +out: /* Restart the domain if we had to stop it to save its state. */ if ( we_stopped_it ) { + printf("Restart domain\n"); op.cmd = DOM0_STARTDOMAIN; op.u.startdomain.domain = (domid_t)domid; (void)do_dom0_op(xc_handle, &op); @@ -382,13 +487,6 @@ int xc_linux_save(int xc_handle, gzclose(gfd); - if ( pm_handle >= 0 ) - (void)close_pfn_mapper(pm_handle); - - if ( pfn_to_mfn_table != NULL ) - free(pfn_to_mfn_table); - if ( mfn_to_pfn_table != NULL ) - free(mfn_to_pfn_table); if ( pfn_type != NULL ) free(pfn_type); @@ -397,4 +495,6 @@ int xc_linux_save(int xc_handle, unlink(state_file); return !!rc; + + } diff --git a/tools/xc/lib/xc_netbsd_build.c b/tools/xc/lib/xc_netbsd_build.c index 8793a512f2..7c67d57d71 100644 --- a/tools/xc/lib/xc_netbsd_build.c +++ b/tools/xc/lib/xc_netbsd_build.c @@ -27,6 +27,7 @@ static long get_tot_pages(int xc_handle, u64 domid) dom0_op_t op; op.cmd = DOM0_GETDOMAININFO; op.u.getdomaininfo.domain = (domid_t)domid; + op.u.getdomaininfo.ctxt = NULL; return (do_dom0_op(xc_handle, &op) < 0) ? -1 : op.u.getdomaininfo.tot_pages; } @@ -59,7 +60,7 @@ static int setup_guestos(int xc_handle, unsigned long tot_pages, unsigned long *virt_startinfo_addr, unsigned long *virt_load_addr, - dom0_builddomain_t *builddomain, + full_execution_context_t *ctxt, const char *cmdline, unsigned long shared_info_frame, unsigned int control_evtchn) @@ -78,8 +79,6 @@ static int setup_guestos(int xc_handle, mmu_t *mmu = NULL; int pm_handle, i; - memset(builddomain, 0, sizeof(*builddomain)); - if ( (pm_handle = init_pfn_mapper((domid_t)dom)) < 0 ) goto error_out; @@ -119,7 +118,7 @@ static int setup_guestos(int xc_handle, */ l2tab = page_array[alloc_index] << PAGE_SHIFT; alloc_index--; - builddomain->ctxt.pt_base = l2tab; + ctxt->pt_base = l2tab; if ( (mmu = init_mmu_updates(xc_handle, dom)) == NULL ) goto error_out; @@ -221,7 +220,7 @@ int xc_netbsd_build(int xc_handle, int kernel_fd = -1; gzFile kernel_gfd = NULL; int rc, i; - full_execution_context_t *ctxt; + full_execution_context_t st_ctxt, *ctxt = &st_ctxt; unsigned long virt_startinfo_addr; if ( (tot_pages = get_tot_pages(xc_handle, domid)) < 0 ) @@ -244,8 +243,15 @@ int xc_netbsd_build(int xc_handle, return 1; } + if ( mlock(&st_ctxt, sizeof(st_ctxt) ) ) + { + PERROR("Unable to mlock ctxt"); + return 1; + } + op.cmd = DOM0_GETDOMAININFO; op.u.getdomaininfo.domain = (domid_t)domid; + op.u.getdomaininfo.ctxt = ctxt; if ( (do_dom0_op(xc_handle, &op) < 0) || ((u64)op.u.getdomaininfo.domain != domid) ) { @@ -253,7 +259,7 @@ int xc_netbsd_build(int xc_handle, goto error_out; } if ( (op.u.getdomaininfo.state != DOMSTATE_STOPPED) || - (op.u.getdomaininfo.ctxt.pt_base != 0) ) + (op.u.getdomaininfo.ctxt->pt_base != 0) ) { ERROR("Domain is already constructed"); goto error_out; @@ -261,7 +267,7 @@ int xc_netbsd_build(int xc_handle, if ( setup_guestos(xc_handle, domid, kernel_gfd, tot_pages, &virt_startinfo_addr, - &load_addr, &launch_op.u.builddomain, cmdline, + &load_addr, &st_ctxt, cmdline, op.u.getdomaininfo.shared_info_frame, control_evtchn) < 0 ) { @@ -274,8 +280,6 @@ int xc_netbsd_build(int xc_handle, if( kernel_gfd ) gzclose(kernel_gfd); - ctxt = &launch_op.u.builddomain.ctxt; - ctxt->flags = 0; /* @@ -328,9 +332,11 @@ int xc_netbsd_build(int xc_handle, ctxt->failsafe_callback_cs = FLAT_GUESTOS_CS; ctxt->failsafe_callback_eip = 0; + memset( &launch_op, 0, sizeof(launch_op) ); + launch_op.u.builddomain.domain = (domid_t)domid; launch_op.u.builddomain.num_vifs = 1; - + launch_op.u.builddomain.ctxt = ctxt; launch_op.cmd = DOM0_BUILDDOMAIN; rc = do_dom0_op(xc_handle, &launch_op); diff --git a/tools/xc/lib/xc_private.c b/tools/xc/lib/xc_private.c index 485aa58754..d137176ca8 100644 --- a/tools/xc/lib/xc_private.c +++ b/tools/xc/lib/xc_private.c @@ -45,6 +45,234 @@ void unmap_pfn(int pm_handle, void *vaddr) (void)munmap(vaddr, PAGE_SIZE); } +/*******************/ + +void * mfn_mapper_map_single(int xc_handle, domid_t dom, + int size, int prot, + unsigned long mfn ) +{ + privcmd_mmap_t ioctlx; + privcmd_mmap_entry_t entry; + void *addr; + addr = mmap( NULL, size, prot, MAP_SHARED, xc_handle, 0 ); + if (addr) + { + ioctlx.num=1; + ioctlx.dom=dom; + ioctlx.entry=&entry; + entry.va=(unsigned long) addr; + entry.mfn=mfn; + entry.npages=(size+PAGE_SIZE-1)>>PAGE_SHIFT; + if ( ioctl( xc_handle, IOCTL_PRIVCMD_MMAP, &ioctlx ) <0 ) + return 0; + } + return addr; +} + +mfn_mapper_t * mfn_mapper_init(int xc_handle, domid_t dom, int size, int prot) +{ + mfn_mapper_t * t; + t = calloc( 1, sizeof(mfn_mapper_t)+ + mfn_mapper_queue_size*sizeof(privcmd_mmap_entry_t) ); + if (!t) return NULL; + t->xc_handle = xc_handle; + t->size = size; + t->prot = prot; + t->error = 0; + t->max_queue_size = mfn_mapper_queue_size; + t->addr = mmap( NULL, size, prot, MAP_SHARED, xc_handle, 0 ); + if (!t->addr) + { + free(t); + return NULL; + } + t->ioctl.num = 0; + t->ioctl.dom = dom; + t->ioctl.entry = (privcmd_mmap_entry_t *) &t[1]; + return t; +} + +void * mfn_mapper_base(mfn_mapper_t *t) +{ + return t->addr; +} + +void mfn_mapper_close(mfn_mapper_t *t) +{ + if(t->addr) munmap( t->addr, t->size ); + free(t); +} + +static int __mfn_mapper_flush_queue(mfn_mapper_t *t) +{ + int rc; + rc = ioctl( t->xc_handle, IOCTL_PRIVCMD_MMAP, &t->ioctl ); + t->ioctl.num = 0; + if(rc && !t->error) + t->error = rc; + return rc; +} + +int mfn_mapper_flush_queue(mfn_mapper_t *t) +{ + int rc; + + rc = __mfn_mapper_flush_queue(t); + + if ( t->error ) + { + rc = t->error; + } + + t->error = 0; + return rc; +} + +void * mfn_mapper_queue_entry(mfn_mapper_t *t, int offset, + unsigned long mfn, int size) +{ + privcmd_mmap_entry_t *entry, *prev; + int pages; + + offset &= PAGE_MASK; + pages =(size+PAGE_SIZE-1)>>PAGE_SHIFT; + entry = &t->ioctl.entry[t->ioctl.num]; + + if ( t->ioctl.num > 0 ) + { + prev = &t->ioctl.entry[t->ioctl.num-1]; + + if ( (prev->va+(prev->npages*PAGE_SIZE)) == + ((unsigned long)t->addr+offset) && + (prev->mfn+prev->npages) == mfn ) + { + prev->npages += pages; + return t->addr+offset; + } + } + + entry->va = (unsigned long)t->addr+offset; + entry->mfn = mfn; + entry->npages = pages; + t->ioctl.num++; + + if(t->ioctl.num == t->max_queue_size) + { + if ( __mfn_mapper_flush_queue(t) ) + return 0; + } + + return t->addr+offset; +} + + +/*******************/ + +#if 0 + +mfn_typer_t *mfn_typer_init(int xc_handle, domid_t dom, int num ) +{ + mfn_typer_t *t; + multicall_entry_t *m; + dom0_op_compact_getpageframeinfo_t *d; + + t = calloc(1, sizeof(mfn_typer_t) ); + m = calloc(num, sizeof(multicall_entry_t)); + d = calloc(num, sizeof(dom0_op_compact_getpageframeinfo_t)); + + if (!t || !m || !d) + { + if(t) free(t); + if(m) free(m); + if(d) free(d); + return NULL; + } + +printf("sizeof(m)=%d sizeof(d)=%d m=%p d=%p\n",sizeof(multicall_entry_t), sizeof(dom0_op_compact_getpageframeinfo_t),m,d); + + if ( (mlock(m, sizeof(multicall_entry_t)*num ) != 0) || + (mlock(d, sizeof(dom0_op_compact_getpageframeinfo_t)*num ) != 0) ) + { + PERROR("Could not lock memory for Xen hypercall"); + return NULL; + } + + t->xc_handle = xc_handle; + t->max = num; + t->nr_multicall_ents=0; + t->multicall_list=m; + t->gpf_list=d; + t->dom = dom; + + return t; +} + +void mfn_typer_queue_entry(mfn_typer_t *t, unsigned long mfn ) +{ + int i = t->nr_multicall_ents; + multicall_entry_t *m = &t->multicall_list[i]; + dom0_op_compact_getpageframeinfo_t *d = &t->gpf_list[i]; + + d->cmd = DOM0_GETPAGEFRAMEINFO; + d->interface_version = DOM0_INTERFACE_VERSION; + d->getpageframeinfo.pfn = mfn; + d->getpageframeinfo.domain = t->dom; + d->getpageframeinfo.type = 1000; //~0UL; + + m->op = __HYPERVISOR_dom0_op; + m->args[0] = (unsigned long)d; + + t->nr_multicall_ents++; +} + +int mfn_typer_flush_queue(mfn_typer_t *t) +{ + if (t->nr_multicall_ents == 0) return 0; + do_multicall_op(t->xc_handle, t->multicall_list, t->nr_multicall_ents); + t->nr_multicall_ents = 0; +} + +unsigned int mfn_typer_get_result(mfn_typer_t *t, int idx) +{ + return t->gpf_list[idx].getpageframeinfo.type; +} + +#endif + +/* NB: arr must be mlock'ed */ + +int get_pfn_type_batch(int xc_handle, + u64 dom, int num, unsigned long *arr) +{ + dom0_op_t op; + op.cmd = DOM0_GETPAGEFRAMEINFO2; + op.u.getpageframeinfo2.domain = (domid_t)dom; + op.u.getpageframeinfo2.num = num; + op.u.getpageframeinfo2.array = arr; + return do_dom0_op(xc_handle, &op); +} + +#define GETPFN_ERR (~0U) +unsigned int get_pfn_type(int xc_handle, + unsigned long mfn, + u64 dom) +{ + dom0_op_t op; + op.cmd = DOM0_GETPAGEFRAMEINFO; + op.u.getpageframeinfo.pfn = mfn; + op.u.getpageframeinfo.domain = (domid_t)dom; + if ( do_dom0_op(xc_handle, &op) < 0 ) + { + PERROR("Unexpected failure when getting page frame info!"); + return GETPFN_ERR; + } + return op.u.getpageframeinfo.type; +} + + + +/*******************/ + #define FIRST_MMU_UPDATE 2 static int flush_mmu_updates(int xc_handle, mmu_t *mmu) diff --git a/tools/xc/lib/xc_private.h b/tools/xc/lib/xc_private.h index d4299109e5..eaa301772d 100644 --- a/tools/xc/lib/xc_private.h +++ b/tools/xc/lib/xc_private.h @@ -16,8 +16,6 @@ #include "xc.h" -#include <asm-xen/proc_cmd.h> - /* from xen/include/hypervisor-ifs */ #include <hypervisor-if.h> #include <dom0_ops.h> @@ -25,6 +23,10 @@ #include <event_channel.h> #include <sched_ctl.h> +#include <asm-xen/proc_cmd.h> + + + /* from xend/lib */ #include <domain_controller.h> @@ -108,6 +110,27 @@ static inline int do_dom0_op(int xc_handle, dom0_op_t *op) out1: return ret; } +static inline int do_multicall_op(int xc_handle, + void *call_list, int nr_calls) +{ + int ret = -1; + privcmd_hypercall_t hypercall; + + hypercall.op = __HYPERVISOR_multicall; + hypercall.arg[0] = (unsigned long)call_list; + hypercall.arg[1] = (unsigned long)nr_calls; + + if ( (ret = do_xen_hypercall(xc_handle, &hypercall)) < 0 ) + { + if ( errno == EACCES ) + fprintf(stderr, "Dom0 operation failed -- need to" + " rebuild the user-space tool set?\n"); + goto out1; + } + + out1: return ret; +} + static inline int do_network_op(int xc_handle, network_op_t *op) { int ret = -1; @@ -174,4 +197,77 @@ int add_mmu_update(int xc_handle, mmu_t *mmu, unsigned long ptr, unsigned long val); int finish_mmu_updates(int xc_handle, mmu_t *mmu); + +/* + * ioctl-based mfn mapping interface + */ + +/* +typedef struct privcmd_mmap_entry { + unsigned long va; + unsigned long mfn; + unsigned long npages; +} privcmd_mmap_entry_t; + +typedef struct privcmd_mmap { + int num; + domid_t dom; + privcmd_mmap_entry_t *entry; +} privcmd_mmap_t; +*/ + +#define mfn_mapper_queue_size 128 + +typedef struct mfn_mapper { + int xc_handle; + int size; + int prot; + int error; + int max_queue_size; + void * addr; + privcmd_mmap_t ioctl; + +} mfn_mapper_t; + +void * mfn_mapper_map_single(int xc_handle, domid_t dom, int size, int prot, + unsigned long mfn ); + +mfn_mapper_t * mfn_mapper_init(int xc_handle, domid_t dom, int size, int prot); + +void * mfn_mapper_base(mfn_mapper_t *t); + +void mfn_mapper_close(mfn_mapper_t *t); + +int mfn_mapper_flush_queue(mfn_mapper_t *t); + +void * mfn_mapper_queue_entry(mfn_mapper_t *t, int offset, + unsigned long mfn, int size ); + +/*********************/ + + +#if 0 +typedef struct mfn_typer { + domid_t dom; + int xc_handle; + int max; + dom0_op_t op; +} mfn_typer_t; + + +mfn_typer_t *mfn_typer_init(int xc_handle, domid_t dom, int num ); + +void mfn_typer_queue_entry(mfn_typer_t *t, unsigned long mfn ); + +int mfn_typer_flush_queue(mfn_typer_t *t); +#endif + +int get_pfn_type_batch(int xc_handle, + u64 dom, int num, unsigned long *arr); + +unsigned int get_pfn_type(int xc_handle, + unsigned long mfn, + u64 dom); + + #endif /* __XC_PRIVATE_H__ */ diff --git a/tools/xend/lib/blkif.py b/tools/xend/lib/blkif.py new file mode 100644 index 0000000000..94e058f7ce --- /dev/null +++ b/tools/xend/lib/blkif.py @@ -0,0 +1,143 @@ + +################################################################# +## xend/blkif.py -- Block-interface management functions for Xend +## Copyright (c) 2004, K A Fraser (University of Cambridge) +################################################################# + +import errno, re, os, select, signal, socket, struct, sys +import xend.main, xend.console, xend.manager, xend.utils, Xc + +CMSG_BLKIF_BE = 1 +CMSG_BLKIF_FE = 2 +CMSG_BLKIF_FE_INTERFACE_STATUS_CHANGED = 0 +CMSG_BLKIF_FE_DRIVER_STATUS_CHANGED = 32 +CMSG_BLKIF_FE_INTERFACE_CONNECT = 33 +CMSG_BLKIF_FE_INTERFACE_DISCONNECT = 34 +CMSG_BLKIF_BE_CREATE = 0 +CMSG_BLKIF_BE_DESTROY = 1 +CMSG_BLKIF_BE_CONNECT = 2 +CMSG_BLKIF_BE_DISCONNECT = 3 +CMSG_BLKIF_BE_VBD_CREATE = 4 +CMSG_BLKIF_BE_VBD_DESTROY = 5 +CMSG_BLKIF_BE_VBD_GROW = 6 +CMSG_BLKIF_BE_VBD_SHRINK = 7 + +pendmsg = None +pendaddr = None + +def backend_tx_req(msg): + port = xend.main.dom0_port + if port.space_to_write_request(): + port.write_request(msg) + port.notify() + else: + xend.blkif.pendmsg = msg + +def backend_rx_req(port, msg): + port.write_response(msg) + +def backend_rx_rsp(port, msg): + subtype = (msg.get_header())['subtype'] + print "Received blkif-be response, subtype %d" % subtype + if subtype == CMSG_BLKIF_BE_CREATE: + rsp = { 'success': True } + xend.main.send_management_response(rsp, xend.blkif.pendaddr) + elif subtype == CMSG_BLKIF_BE_CONNECT: + (dom,hnd,evtchn,frame,st) = struct.unpack("QIILI", msg.get_payload()) + blkif = interface.list[xend.main.port_from_dom(dom).local_port] + msg = xend.utils.message(CMSG_BLKIF_FE, \ + CMSG_BLKIF_FE_INTERFACE_STATUS_CHANGED, 0) + msg.append_payload(struct.pack("III",0,2,blkif.evtchn['port2'])) + blkif.ctrlif_tx_req(xend.main.port_list[blkif.key], msg) + elif subtype == CMSG_BLKIF_BE_VBD_CREATE: + (dom,hnd,vdev,ro,st) = struct.unpack("QIHII", msg.get_payload()) + blkif = interface.list[xend.main.port_from_dom(dom).local_port] + (pdev, start_sect, nr_sect, readonly) = blkif.devices[vdev] + msg = xend.utils.message(CMSG_BLKIF_BE, CMSG_BLKIF_BE_VBD_GROW, 0) + msg.append_payload(struct.pack("QIHHHQQI",dom,0,vdev,0, \ + pdev,start_sect,nr_sect,0)) + backend_tx_req(msg) + elif subtype == CMSG_BLKIF_BE_VBD_GROW: + rsp = { 'success': True } + xend.main.send_management_response(rsp, xend.blkif.pendaddr) + +def backend_do_work(port): + global pendmsg + if pendmsg and port.space_to_write_request(): + port.write_request(pendmsg) + pendmsg = None + return True + return False + + +class interface: + + # Dictionary of all block-device interfaces. + list = {} + + + # NB. 'key' is an opaque value that has no meaning in this class. + def __init__(self, dom, key): + self.dom = dom + self.key = key + self.devices = {} + self.pendmsg = None + interface.list[key] = self + msg = xend.utils.message(CMSG_BLKIF_BE, CMSG_BLKIF_BE_CREATE, 0) + msg.append_payload(struct.pack("QII",dom,0,0)) + xend.blkif.pendaddr = xend.main.mgmt_req_addr + backend_tx_req(msg) + + # Attach a device to the specified interface + def attach_device(self, vdev, pdev, start_sect, nr_sect, readonly): + if self.devices.has_key(vdev): + return False + self.devices[vdev] = (pdev, start_sect, nr_sect, readonly) + msg = xend.utils.message(CMSG_BLKIF_BE, CMSG_BLKIF_BE_VBD_CREATE, 0) + msg.append_payload(struct.pack("QIHII",self.dom,0,vdev,readonly,0)) + xend.blkif.pendaddr = xend.main.mgmt_req_addr + backend_tx_req(msg) + return True + + + # Completely destroy this interface. + def destroy(self): + del interface.list[self.key] + msg = xend.utils.message(CMSG_BLKIF_BE, CMSG_BLKIF_BE_DESTROY, 0) + msg.append_payload(struct.pack("QII",self.dom,0,0)) + backend_tx_req(msg) + + + # The parameter @port is the control-interface event channel. This method + # returns True if messages were written to the control interface. + def ctrlif_transmit_work(self, port): + if self.pendmsg and port.space_to_write_request(): + port.write_request(self.pendmsg) + self.pendmsg = None + return True + return False + + def ctrlif_tx_req(self, port, msg): + if port.space_to_write_request(): + port.write_request(msg) + port.notify() + else: + self.pendmsg = msg + + def ctrlif_rx_req(self, port, msg): + port.write_response(msg) + subtype = (msg.get_header())['subtype'] + if subtype == CMSG_BLKIF_FE_DRIVER_STATUS_CHANGED: + msg = xend.utils.message(CMSG_BLKIF_FE, \ + CMSG_BLKIF_FE_INTERFACE_STATUS_CHANGED, 0) + msg.append_payload(struct.pack("III",0,1,0)) + self.ctrlif_tx_req(port, msg) + elif subtype == CMSG_BLKIF_FE_INTERFACE_CONNECT: + (hnd,frame) = struct.unpack("IL", msg.get_payload()) + xc = Xc.new() + self.evtchn = xc.evtchn_bind_interdomain(dom1=0,dom2=self.dom) + msg = xend.utils.message(CMSG_BLKIF_BE, \ + CMSG_BLKIF_BE_CONNECT, 0) + msg.append_payload(struct.pack("QIILI",self.dom,0, \ + self.evtchn['port1'],frame,0)) + backend_tx_req(msg) diff --git a/tools/xend/lib/console.py b/tools/xend/lib/console.py index aad6069979..57898817f5 100644 --- a/tools/xend/lib/console.py +++ b/tools/xend/lib/console.py @@ -5,7 +5,7 @@ ############################################################# import errno, re, os, select, signal, socket, struct, sys - +import xend.blkif, xend.main, xend.manager, xend.utils, Xc ## ## interface: @@ -16,7 +16,7 @@ import errno, re, os, select, signal, socket, struct, sys ## CONNECTED: sending/receiving console data on TCP port 'self.port' ## ## A dictionary of all active interfaces, indexed by TCP socket descriptor, -## is accessible as 'interface.interface_list'. +## is accessible as 'interface.list_by_fd'. ## ## NB. When a class instance is to be destroyed you *must* call the 'close' ## method. Otherwise a stale reference will eb left in the interface list. @@ -30,7 +30,11 @@ class interface: # Dictionary of all active (non-closed) console interfaces. - interface_list = {} + list_by_fd = {} + + + # Dictionary of all console interfaces, closed and open. + list = {} # NB. 'key' is an opaque value that has no meaning in this class. @@ -38,6 +42,9 @@ class interface: self.status = interface.CLOSED self.port = port self.key = key + self.rbuf = xend.utils.buffer() + self.wbuf = xend.utils.buffer() + interface.list[key] = self # Is this interface closed (inactive)? @@ -58,14 +65,14 @@ class interface: # Close the interface, if it is not closed already. def close(self): if not self.closed(): - del interface.interface_list[self.sock.fileno()] + del interface.list_by_fd[self.sock.fileno()] self.sock.close() del self.sock self.status = interface.CLOSED # Move the interface into the 'listening' state. Opens a new listening - # socket and updates 'interface_list'. + # socket and updates 'list_by_fd'. def listen(self): # Close old socket (if any), and create a fresh one. self.close() @@ -80,7 +87,7 @@ class interface: # Announce the new status of thsi interface. self.status = interface.LISTENING - interface.interface_list[self.sock.fileno()] = self + interface.list_by_fd[self.sock.fileno()] = self except: # In case of trouble ensure we get rid of dangling socket reference @@ -105,7 +112,69 @@ class interface: # Publish the new socket and the new interface state. self.sock = sock self.status = interface.CONNECTED - interface.interface_list[self.sock.fileno()] = self + interface.list_by_fd[self.sock.fileno()] = self return 1 + # Completely sestroy a console interface. + def destroy(self): + self.close() + del interface.list[self.key] + + + # Do work triggered by resource availability on a console-interface socket. + def socket_work(self): + # If the interface is listening, check for pending connections. + if self.listening(): + self.connect() + + # All done if the interface is not connected. + if not self.connected(): + return + + # Send as much pending data as possible via the socket. + while not self.rbuf.empty(): + try: + bytes = self.sock.send(self.rbuf.peek()) + if bytes > 0: + self.rbuf.discard(bytes) + except socket.error, error: + pass + + # Read as much data as is available. Don't worry about + # overflowing our buffer: it's more important to read the + # incoming data stream and detect errors or closure of the + # remote end in a timely manner. + try: + while 1: + data = self.sock.recv(2048) + # Return of zero means the remote end has disconnected. + # We therefore return the console interface to listening. + if not data: + self.listen() + break + self.wbuf.write(data) + except socket.error, error: + # Assume that most errors mean that the connection is dead. + # In such cases we return the interface to 'listening' state. + if error[0] != errno.EAGAIN: + print "Better return to listening" + self.listen() + print "New status: " + str(self.status) + + + # The parameter @port is the control-interface event channel. This method + # returns True if messages were written to the control interface. + def ctrlif_transmit_work(self, port): + work_done = False + while not self.wbuf.empty() and port.space_to_write_request(): + msg = xend.utils.message(0, 0, 0) + msg.append_payload(self.wbuf.read(msg.MAX_PAYLOAD)) + port.write_request(msg) + work_done = True + return work_done + + + def ctrlif_rx_req(self, port, msg): + self.rbuf.write(msg.get_payload()) + port.write_response(msg) diff --git a/tools/xend/lib/domain_controller.h b/tools/xend/lib/domain_controller.h index d9ea7d6160..68d4fac1d2 100644 --- a/tools/xend/lib/domain_controller.h +++ b/tools/xend/lib/domain_controller.h @@ -76,8 +76,8 @@ typedef struct { /* Messages from guest to domain controller. */ #define CMSG_BLKIF_FE_DRIVER_STATUS_CHANGED 32 -#define CMSG_BLKIF_FE_INTERFACE_UP 33 -#define CMSG_BLKIF_FE_INTERFACE_DOWN 34 +#define CMSG_BLKIF_FE_INTERFACE_CONNECT 33 +#define CMSG_BLKIF_FE_INTERFACE_DISCONNECT 34 /* These are used by both front-end and back-end drivers. */ #define blkif_vdev_t u16 @@ -91,13 +91,13 @@ typedef struct { * 1. The shared-memory frame is available for reuse. * 2. Any unacknowledged messgaes pending on the interface were dropped. */ -#define BLKIF_INTERFACE_STATUS_DESTROYED 0 /* Interface doesn't exist. */ -#define BLKIF_INTERFACE_STATUS_DOWN 1 /* Interface exists but is down. */ -#define BLKIF_INTERFACE_STATUS_UP 2 /* Interface exists and is up. */ +#define BLKIF_INTERFACE_STATUS_DESTROYED 0 /* Interface doesn't exist. */ +#define BLKIF_INTERFACE_STATUS_DISCONNECTED 1 /* Exists but is disconnected. */ +#define BLKIF_INTERFACE_STATUS_CONNECTED 2 /* Exists and is connected. */ typedef struct { unsigned int handle; unsigned int status; - unsigned int evtchn; /* status == BLKIF_INTERFACE_STATUS_UP */ + unsigned int evtchn; /* status == BLKIF_INTERFACE_STATUS_CONNECTED */ } blkif_fe_interface_status_changed_t; /* @@ -109,30 +109,37 @@ typedef struct { * If the driver goes DOWN while interfaces are still UP, the domain * will automatically take the interfaces DOWN. */ -#define BLKIF_DRIVER_STATUS_DOWN 0 -#define BLKIF_DRIVER_STATUS_UP 1 +#define BLKIF_DRIVER_STATUS_DOWN 0 +#define BLKIF_DRIVER_STATUS_UP 1 typedef struct { unsigned int status; /* BLKIF_DRIVER_STATUS_??? */ } blkif_fe_driver_status_changed_t; /* - * CMSG_BLKIF_FE_INTERFACE_UP: - * If successful, the domain controller will acknowledge with a STATUS_UP - * message. + * CMSG_BLKIF_FE_INTERFACE_CONNECT: + * If successful, the domain controller will acknowledge with a + * STATUS_CONNECTED message. */ typedef struct { unsigned int handle; unsigned long shmem_frame; -} blkif_fe_interface_up_t; +} blkif_fe_interface_connect_t; /* - * CMSG_BLKIF_FE_INTERFACE_DOWN: - * If successful, the domain controller will acknowledge with a STATUS_DOWN - * message. + * CMSG_BLKIF_FE_INTERFACE_DISCONNECT: + * If successful, the domain controller will acknowledge with a + * STATUS_DISCONNECTED message. */ typedef struct { + /* IN */ unsigned int handle; -} blkif_fe_interface_down_t; + /* OUT */ + /* + * Tells driver how many interfaces it should expect to immediately + * receive notifications about. + */ + unsigned int nr_interfaces; +} blkif_fe_interface_disconnect_t; /****************************************************************************** @@ -142,10 +149,12 @@ typedef struct { /* Messages from domain controller. */ #define CMSG_BLKIF_BE_CREATE 0 /* Create a new block-device interface. */ #define CMSG_BLKIF_BE_DESTROY 1 /* Destroy a block-device interface. */ -#define CMSG_BLKIF_BE_VBD_CREATE 2 /* Create a new VBD for an interface. */ -#define CMSG_BLKIF_BE_VBD_DESTROY 3 /* Delete a VBD from an interface. */ -#define CMSG_BLKIF_BE_VBD_GROW 4 /* Append an extent to a given VBD. */ -#define CMSG_BLKIF_BE_VBD_SHRINK 5 /* Remove last extent from a given VBD. */ +#define CMSG_BLKIF_BE_CONNECT 2 /* Connect i/f to remote driver. */ +#define CMSG_BLKIF_BE_DISCONNECT 3 /* Disconnect i/f from remote driver. */ +#define CMSG_BLKIF_BE_VBD_CREATE 4 /* Create a new VBD for an interface. */ +#define CMSG_BLKIF_BE_VBD_DESTROY 5 /* Delete a VBD from an interface. */ +#define CMSG_BLKIF_BE_VBD_GROW 6 /* Append an extent to a given VBD. */ +#define CMSG_BLKIF_BE_VBD_SHRINK 7 /* Remove last extent from a given VBD. */ /* Messages to domain controller. */ #define CMSG_BLKIF_BE_DRIVER_STATUS_CHANGED 32 @@ -167,36 +176,36 @@ typedef struct { /* The following are specific error returns. */ #define BLKIF_BE_STATUS_INTERFACE_EXISTS 2 #define BLKIF_BE_STATUS_INTERFACE_NOT_FOUND 3 -#define BLKIF_BE_STATUS_VBD_EXISTS 4 -#define BLKIF_BE_STATUS_VBD_NOT_FOUND 5 -#define BLKIF_BE_STATUS_OUT_OF_MEMORY 6 -#define BLKIF_BE_STATUS_EXTENT_NOT_FOUND 7 -#define BLKIF_BE_STATUS_MAPPING_ERROR 8 +#define BLKIF_BE_STATUS_INTERFACE_CONNECTED 4 +#define BLKIF_BE_STATUS_VBD_EXISTS 5 +#define BLKIF_BE_STATUS_VBD_NOT_FOUND 6 +#define BLKIF_BE_STATUS_OUT_OF_MEMORY 7 +#define BLKIF_BE_STATUS_EXTENT_NOT_FOUND 8 +#define BLKIF_BE_STATUS_MAPPING_ERROR 9 /* This macro can be used to create an array of descriptive error strings. */ -#define BLKIF_BE_STATUS_ERRORS { \ - "Okay", \ - "Non-specific error", \ - "Interface already exists", \ - "Interface not found", \ - "VBD already exists", \ - "VBD not found", \ - "Out of memory", \ - "Extent not found for VBD", \ +#define BLKIF_BE_STATUS_ERRORS { \ + "Okay", \ + "Non-specific error", \ + "Interface already exists", \ + "Interface not found", \ + "Interface is still connected", \ + "VBD already exists", \ + "VBD not found", \ + "Out of memory", \ + "Extent not found for VBD", \ "Could not map domain memory" } /* * CMSG_BLKIF_BE_CREATE: * When the driver sends a successful response then the interface is fully - * set up. The controller will send an UP notification to the front-end + * created. The controller will send a DOWN notification to the front-end * driver. */ typedef struct { /* IN */ domid_t domid; /* Domain attached to new interface. */ unsigned int blkif_handle; /* Domain-specific interface handle. */ - unsigned int evtchn; /* Event channel for notifications. */ - unsigned long shmem_frame; /* Page cont. shared comms window. */ /* OUT */ unsigned int status; } blkif_be_create_t; @@ -204,8 +213,8 @@ typedef struct { /* * CMSG_BLKIF_BE_DESTROY: * When the driver sends a successful response then the interface is fully - * torn down. The controller will send a DOWN notification to the front-end - * driver. + * torn down. The controller will send a DESTROYED notification to the + * front-end driver. */ typedef struct { /* IN */ @@ -215,6 +224,36 @@ typedef struct { unsigned int status; } blkif_be_destroy_t; +/* + * CMSG_BLKIF_BE_CONNECT: + * When the driver sends a successful response then the interface is fully + * connected. The controller will send a CONNECTED notification to the + * front-end driver. + */ +typedef struct { + /* IN */ + domid_t domid; /* Domain attached to new interface. */ + unsigned int blkif_handle; /* Domain-specific interface handle. */ + unsigned int evtchn; /* Event channel for notifications. */ + unsigned long shmem_frame; /* Page cont. shared comms window. */ + /* OUT */ + unsigned int status; +} blkif_be_connect_t; + +/* + * CMSG_BLKIF_BE_DISCONNECT: + * When the driver sends a successful response then the interface is fully + * disconnected. The controller will send a DOWN notification to the front-end + * driver. + */ +typedef struct { + /* IN */ + domid_t domid; /* Domain attached to new interface. */ + unsigned int blkif_handle; /* Domain-specific interface handle. */ + /* OUT */ + unsigned int status; +} blkif_be_disconnect_t; + /* CMSG_BLKIF_BE_VBD_CREATE */ typedef struct { /* IN */ @@ -264,7 +303,14 @@ typedef struct { * will automatically send DOWN notifications. */ typedef struct { + /* IN */ unsigned int status; /* BLKIF_DRIVER_STATUS_??? */ + /* OUT */ + /* + * Tells driver how many interfaces it should expect to immediately + * receive notifications about. + */ + unsigned int nr_interfaces; } blkif_be_driver_status_changed_t; #endif /* __DOMAIN_CONTROLLER_H__ */ diff --git a/tools/xend/lib/main.py b/tools/xend/lib/main.py index b870af55d1..7b5adbab83 100755 --- a/tools/xend/lib/main.py +++ b/tools/xend/lib/main.py @@ -5,7 +5,7 @@ ########################################################### import errno, re, os, pwd, select, signal, socket, struct, sys, time -import xend.console, xend.manager, xend.utils, Xc +import xend.blkif, xend.console, xend.manager, xend.utils, Xc # The following parameters could be placed in a configuration file. @@ -16,13 +16,35 @@ CONTROL_DIR = '/var/run/xend' UNIX_SOCK = 'management_sock' # relative to CONTROL_DIR +CMSG_CONSOLE = 0 +CMSG_BLKIF_BE = 1 +CMSG_BLKIF_FE = 2 + + +def port_from_dom(dom): + global port_list + for idx, port in port_list.items(): + if port.remote_dom == dom: + return port + return None + + +def send_management_response(response, addr): + try: + response = str(response) + print "Mgmt_rsp[%s]: %s" % (addr, response) + management_interface.sendto(response, addr) + except socket.error, error: + pass + + def daemon_loop(): # Could we do this more nicely? The xend.manager functions need access # to this global state to do their work. - global control_list, notifier + global port_list, notifier, management_interface, mgmt_req_addr, dom0_port - # List of all control interfaces, indexed by local event-channel port. - control_list = {} + # Lists of all interfaces, indexed by local event-channel port. + port_list = {} xc = Xc.new() @@ -46,13 +68,10 @@ def daemon_loop(): # The DOM0 control interface is not set up via the management interface. # Note that console messages don't come our way (actually, only driver - # back-ends should use the DOM0 control interface) -- the console - # structures are dummies. + # back-ends should use the DOM0 control interface). dom0_port = xend.utils.port(0) - xend.main.notifier.bind(dom0_port.local_port) - xend.main.control_list[dom0_port.local_port] = \ - (dom0_port, xend.utils.buffer(), xend.utils.buffer(), \ - xend.console.interface(0, dom0_port.local_port)) + notifier.bind(dom0_port.local_port) + port_list[dom0_port.local_port] = dom0_port ## ## MAIN LOOP @@ -68,10 +87,10 @@ def daemon_loop(): waitset = select.poll() waitset.register(management_interface, select.POLLIN) waitset.register(notifier, select.POLLIN) - for idx, (port, rbuf, wbuf, con_if) in control_list.items(): + for idx, con_if in xend.console.interface.list_by_fd.items(): if not con_if.closed(): pflags = select.POLLIN - if not rbuf.empty() and con_if.connected(): + if not con_if.rbuf.empty() and con_if.connected(): pflags = select.POLLIN | select.POLLOUT waitset.register(con_if.sock.fileno(), pflags) @@ -82,16 +101,16 @@ def daemon_loop(): # These should consist of executable Python statements that call # well-known management functions (e.g., new_control_interface(dom=9)). try: - data, addr = management_interface.recvfrom(2048) + data, mgmt_req_addr = management_interface.recvfrom(2048) except socket.error, error: if error[0] != errno.EAGAIN: raise else: - if addr: + if mgmt_req_addr: # Evaluate the request in an exception-trapping sandbox. try: - print "Mgmt_req[%s]: %s" % (addr, data) - response = str(eval('xend.manager.'+data)) + print "Mgmt_req[%s]: %s" % (mgmt_req_addr, data) + response = eval('xend.manager.'+data) except: # Catch all exceptions and turn into an error response: @@ -107,69 +126,20 @@ def daemon_loop(): response = str(response) # Try to send a response to the requester. - try: - print "Mgmt_rsp[%s]: %s" % (addr, response) - management_interface.sendto(response, addr) - except socket.error, error: - pass + if response: + send_management_response(response, mgmt_req_addr) # Do work for every console interface that hit in the poll set. for (fd, events) in fdset: - if not xend.console.interface.interface_list.has_key(fd): - continue - con_if = xend.console.interface.interface_list[fd] - - # If the interface is listening, check for pending connections. - if con_if.listening(): - con_if.connect() - - # All done if the interface is not connected. - if not con_if.connected(): - continue - (port, rbuf, wbuf, con_if) = control_list[con_if.key] - - # Send as much pending data as possible via the socket. - while not rbuf.empty(): - try: - bytes = con_if.sock.send(rbuf.peek()) - if bytes > 0: - rbuf.discard(bytes) - except socket.error, error: - pass - - # Read as much data as is available. Don't worry about - # overflowing our buffer: it's more important to read the - # incoming data stream and detect errors or closure of the - # remote end in a timely manner. - try: - while 1: - data = con_if.sock.recv(2048) - # Return of zero means the remote end has disconnected. - # We therefore return the console interface to listening. - if not data: - con_if.listen() - break - wbuf.write(data) - except socket.error, error: - # Assume that most errors mean that the connection is dead. - # In such cases we return the interface to 'listening' state. - if error[0] != errno.EAGAIN: - print "Better return to listening" - con_if.listen() - print "New status: " + str(con_if.status) - - # We may now have pending data to send via the relevant - # inter-domain control interface. If so then we send all we can - # and notify the remote end. - work_done = False - while not wbuf.empty() and port.space_to_write_request(): - msg = xend.utils.message(0, 0, 0) - msg.append_payload(wbuf.read(msg.MAX_PAYLOAD)) - port.write_request(msg) - work_done = True - if work_done: - port.notify() - + if xend.console.interface.list_by_fd.has_key(fd): + con_if = xend.console.interface.list_by_fd[fd] + con_if.socket_work() + # We may now have pending data to send via the control + # interface. If so then send all we can and notify the remote. + port = port_list[con_if.key] + if con_if.ctrlif_transmit_work(port): + port.notify() + # Process control-interface notifications from other guest OSes. while 1: # Grab a notification, if there is one. @@ -178,42 +148,69 @@ def daemon_loop(): break (idx, type) = notification - if not control_list.has_key(idx): + if not port_list.has_key(idx): continue - (port, rbuf, wbuf, con_if) = control_list[idx] + port = port_list[idx] work_done = False + con_if = False + if xend.console.interface.list.has_key(idx): + con_if = xend.console.interface.list[idx] + + blk_if = False + if xend.blkif.interface.list.has_key(idx): + blk_if = xend.blkif.interface.list[idx] + # If we pick up a disconnect notification then we do any necessary # cleanup. if type == notifier.EXCEPTION: ret = xc.evtchn_status(idx) if ret['status'] == 'unbound': notifier.unbind(idx) - con_if.close() - del control_list[idx], port, rbuf, wbuf, con_if + del port_list[idx], port + if con_if: + con_if.destroy() + del con_if + if blk_if: + blk_if.destroy() + del blk_if continue - # Read incoming requests. Currently assume that request - # message always containb console data. + # Process incoming requests. while port.request_to_read(): msg = port.read_request() - rbuf.write(msg.get_payload()) - port.write_response(msg) work_done = True - - # Incoming responses are currently thrown on the floor. + type = (msg.get_header())['type'] + if type == CMSG_CONSOLE and con_if: + con_if.ctrlif_rx_req(port, msg) + elif type == CMSG_BLKIF_FE and blk_if: + blk_if.ctrlif_rx_req(port, msg) + elif type == CMSG_BLKIF_BE and port == dom0_port: + xend.blkif.backend_rx_req(port, msg) + else: + port.write_response(msg) + + # Process incoming responses. while port.response_to_read(): msg = port.read_response() work_done = True + type = (msg.get_header())['type'] + if type == CMSG_BLKIF_BE and port == dom0_port: + xend.blkif.backend_rx_rsp(port, msg) + + # Send console data. + if con_if and con_if.ctrlif_transmit_work(port): + work_done = True - # Send as much pending console data as there is room for. - while not wbuf.empty() and port.space_to_write_request(): - msg = xend.utils.message(0, 0, 0) - msg.append_payload(wbuf.read(msg.MAX_PAYLOAD)) - port.write_request(msg) + # Send blkif messages. + if blk_if and blk_if.ctrlif_transmit_work(port): work_done = True + # Back-end block-device work. + if port == dom0_port and xend.blkif.backend_do_work(port): + work_done = True + # Finally, notify the remote end of any work that we did. if work_done: port.notify() diff --git a/tools/xend/lib/manager.py b/tools/xend/lib/manager.py index 42d66d3a95..ea7398cd4c 100644 --- a/tools/xend/lib/manager.py +++ b/tools/xend/lib/manager.py @@ -4,13 +4,13 @@ ## Copyright (c) 2004, K A Fraser (University of Cambridge) ############################################################# -import xend.console, xend.main, xend.utils +import xend.blkif, xend.console, xend.main, xend.utils ## ## new_control_interface: -## Create a new control interface with the specified domain 'dom'. -## The console port may also be specified; otehrwise a suitable port is +## Create a new control interface with the specified domain @dom. +## The console port may also be specified; otherwise a suitable port is ## automatically allocated. ## def new_control_interface(dom, console_port=-1): @@ -26,9 +26,8 @@ def new_control_interface(dom, console_port=-1): con_if = xend.console.interface(console_port, port.local_port) con_if.listen() - # Add control state to the master list. - xend.main.control_list[port.local_port] = \ - (port, xend.utils.buffer(), xend.utils.buffer(), con_if) + # Update the master port list. + xend.main.port_list[port.local_port] = port # Construct the successful response to be returned to the requester. response = { 'success': True } @@ -36,3 +35,81 @@ def new_control_interface(dom, console_port=-1): response['remote_port'] = port.remote_port response['console_port'] = console_port return response + + +## +## new_block_interface: +## Create a new block interface for the specified domain @dom. +## +def new_block_interface(dom, handle=-1): + # By default we create an interface with handle zero. + if handle < 0: + handle = 0 + + # We only support one interface per domain, which must have handle zero. + if handle != 0: + response = { 'success': False } + response['error_type'] = 'Bad handle %d (only handle 0 ' + \ + 'is supported)' % handle + return response + + # Find local event-channel port associated with the specified domain. + port = xend.main.port_from_dom(dom) + if not port: + response = { 'success': False } + response['error_type'] = 'Unknown domain %d' % dom + return response + + # The interface must not already exist. + if xend.blkif.interface.list.has_key(port.local_port): + response = { 'success': False } + response['error_type'] = 'Interface (dom=%d,handle=%d) already ' + \ + 'exists' % (dom, handle) + return response + + # Create the new interface. Initially no virtual devices are attached. + xend.blkif.interface(dom, port.local_port) + + # Response is deferred until back-end driver sends acknowledgement. + return None + + +## +## new_block_device: +## Attach a new virtual block device to the specified block interface +## (@dom, @handle). The new device is identified by @vdev, and maps to +## the real block extent (@pdev, @start_sect, @nr_sect). If @readonly then +## write requests to @vdev will be rejected. +## +def new_block_device(dom, handle, vdev, pdev, start_sect, nr_sect, readonly): + # We only support one interface per domain, which must have handle zero. + if handle != 0: + response = { 'success': False } + response['error_type'] = 'Bad handle %d (only handle 0 ' + \ + 'is supported)' % handle + return response + + # Find local event-channel port associated with the specified domain. + port = xend.main.port_from_dom(dom) + if not port: + response = { 'success': False } + response['error_type'] = 'Unknown domain %d' % dom + return response + + # The interface must exist. + if not xend.blkif.interface.list.has_key(port.local_port): + response = { 'success': False } + response['error_type'] = 'Interface (dom=%d,handle=%d) does not ' + \ + 'exists' % (dom, handle) + return response + + # The virtual device must not yet exist. + blkif = xend.blkif.interface.list[port.local_port] + if not blkif.attach_device(vdev, pdev, start_sect, nr_sect, readonly): + response = { 'success': False } + response['error_type'] = 'Vdevice (dom=%d,handle=%d,vdevice=%d) ' + \ + 'already exists' % (dom, handle, vdev) + return response + + # Response is deferred until back-end driver sends acknowledgement. + return None diff --git a/tools/xend/lib/utils.c b/tools/xend/lib/utils.c index c28d682ec9..297976e9be 100644 --- a/tools/xend/lib/utils.c +++ b/tools/xend/lib/utils.c @@ -22,11 +22,11 @@ #include <signal.h> #include <xc.h> -#include <asm-xen/proc_cmd.h> - #include <hypervisor-if.h> #include "domain_controller.h" +#include <asm-xen/proc_cmd.h> + /* Needed for Python versions earlier than 2.3. */ #ifndef PyMODINIT_FUNC #define PyMODINIT_FUNC DL_EXPORT(void) |