diff options
36 files changed, 1742 insertions, 937 deletions
@@ -102,6 +102,7 @@ 4055ee44Bu6oP7U0WxxXypbUt4dNPQ tools/xenctl/setup.py 40431ac64Hj4ixUnKmlugZKhXPFE_Q tools/xend/Makefile 4055ad95Se-FqttgxollqOAAHB94zA tools/xend/lib/__init__.py +4092738fMRGC9fFBcPRCWaJaj9U3ag tools/xend/lib/blkif.py 4055ad97wMLUj0BZT0e_T0EwQN0Bvw tools/xend/lib/console.py 4048c0ddsF0WrU7HUzTvg1MJoCIfWg tools/xend/lib/domain_controller.h 4054a301VEag2GwrBrFBna5U1BGlLA tools/xend/lib/main.py @@ -665,8 +666,8 @@ 4087cf0dlv1Dw4MAbeRStPPG8IvPPg xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/vbd.c 40880cc6hHg6s2cPHbqPNQxENefjoQ xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/blkif.h 4075806dI5kfeMD5RV-DA0PYoThx_w xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/Makefile -4075806d3fJqqDC1pYYPTZPc575iKg xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/block.c -4075806d4-j7vN0Mn0bklI1cRUX1vQ xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/block.h +4075806d4-j7vN0Mn0bklI1cRUX1vQ xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/common.h +4075806d3fJqqDC1pYYPTZPc575iKg xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/main.c 4075806dibjCcfuXv6CINMhxWTw3jQ xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/vbd.c 3e5a4e65iHEuC5sjFhj42XALYbLVRw xenolinux-2.4.26-sparse/arch/xen/drivers/block/Makefile 3e5a4e65pP5spJErBW69pJxSSdK9RA xenolinux-2.4.26-sparse/arch/xen/drivers/block/block.c diff --git a/tools/examples/xc_dom_create.py b/tools/examples/xc_dom_create.py index 799319c6a6..0fae2b251b 100755 --- a/tools/examples/xc_dom_create.py +++ b/tools/examples/xc_dom_create.py @@ -1,7 +1,7 @@ #!/usr/bin/env python import string, sys, os, time, socket, getopt, signal, syslog -import Xc, xenctl.utils, xenctl.console_client +import Xc, xenctl.utils, xenctl.console_client, re config_dir = '/etc/xc/' config_file = xc_config_file = config_dir + 'defaults' @@ -195,6 +195,15 @@ output('VM cmdline : "%s"' % cmdline) if dryrun: sys.exit(1) +##### HACK HACK HACK +##### Until everyone moves to the new I/O world, and a more robust 
domain +##### controller (xend), we use this little trick to discover whether we +##### are in a testing environment for new I/O stuff. +new_io_world = True +for line in os.popen('cat /proc/interrupts').readlines(): + if re.search('blkdev', line): + new_io_world = False + ##### Code beyond this point is actually used to manage the mechanics of ##### starting (and watching if necessary) guest virtual machines. @@ -228,19 +237,23 @@ def make_domain(): cmsg = 'new_control_interface(dom='+str(id)+', console_port='+str(console_port)+')' - xend_response = xenctl.utils.xend_control_message(cmsg) + cons_response = xenctl.utils.xend_control_message(cmsg) - if not xend_response['success']: + if not cons_response['success']: print "Error creating initial event channel" - print "Error type: " + xend_response['error_type'] - if xend_response['error_type'] == 'exception': - print "Exception type: " + xend_response['exception_type'] - print "Exception value: " + xend_response['exception_value'] + print "Error type: " + cons_response['error_type'] + if cons_response['error_type'] == 'exception': + print "Exception type: " + cons_response['exception_type'] + print "Exception value: " + cons_response['exception_value'] xc.domain_destroy ( dom=id ) sys.exit() + # will the domain have IO privileges? 
+ if pci_device_list != []: io_priv = True + else: io_priv = False + if restore: - ret = eval('xc.%s_restore ( dom=id, state_file=state_file, progress=1 )' % builder_fn) + ret = eval('xc.%s_restore ( dom=id, state_file=state_file, progress=1, io_priv=%d )' % (builder_fn, io_priv)) if ret < 0: print "Error restoring domain" print "Return code = " + str(ret) @@ -248,7 +261,7 @@ def make_domain(): sys.exit() else: - ret = eval('xc.%s_build ( dom=id, image=image, ramdisk=ramdisk, cmdline=cmdline, control_evtchn=xend_response["remote_port"] )' % builder_fn) + ret = eval('xc.%s_build ( dom=id, image=image, ramdisk=ramdisk, cmdline=cmdline, control_evtchn=cons_response["remote_port"], io_priv=%d )' % (builder_fn, io_priv) ) if ret < 0: print "Error building Linux guest OS: " print "Return code = " + str(ret) @@ -259,6 +272,18 @@ def make_domain(): # set the expertise level appropriately xenctl.utils.VBD_EXPERT_MODE = vbd_expert + + if new_io_world: + cmsg = 'new_block_interface(dom='+str(id)+')' + xend_response = xenctl.utils.xend_control_message(cmsg) + if not xend_response['success']: + print "Error creating block interface" + print "Error type: " + xend_response['error_type'] + if xend_response['error_type'] == 'exception': + print "Exception type: " + xend_response['exception_type'] + print "Exception val: " + xend_response['exception_value'] + xc.domain_destroy ( dom=id ) + sys.exit() for ( uname, virt_name, rw ) in vbd_list: virt_dev = xenctl.utils.blkdev_name_to_number( virt_name ) @@ -269,42 +294,70 @@ def make_domain(): xc.domain_destroy ( dom=id ) sys.exit() - # check that setting up this VBD won't violate the sharing - # allowed by the current VBD expertise level - if xenctl.utils.vd_extents_validate(segments, rw=='w' or rw=='rw') < 0: - xc.domain_destroy( dom = id ) - sys.exit() + if new_io_world: + if len(segments) > 1: + print "New I/O world cannot deal with multi-extent vdisks" + xc.domain_destroy ( dom=id ) + sys.exit() + seg = segments[0] + cmsg = 
'new_block_device(dom=' + str(id) + \ + ',handle=0,vdev=' + str(virt_dev) + \ + ',pdev=' + str(seg['device']) + \ + ',start_sect=' + str(seg['start_sector']) + \ + ',nr_sect=' + str(seg['nr_sectors']) + \ + ',readonly=' + str(not re.match('w',rw)) + ')' + xend_response = xenctl.utils.xend_control_message(cmsg) + if not xend_response['success']: + print "Error creating virtual block device" + print "Error type: " + xend_response['error_type'] + if xend_response['error_type'] == 'exception': + print "Exception type: " + xend_response['exception_type'] + print "Exception val: " + xend_response['exception_value'] + xc.domain_destroy ( dom=id ) + sys.exit() + else: + # check that setting up this VBD won't violate the sharing + # allowed by the current VBD expertise level + if xenctl.utils.vd_extents_validate(segments, + rw=='w' or rw=='rw') < 0: + xc.domain_destroy( dom = id ) + sys.exit() - if xc.vbd_create( dom=id, vbd=virt_dev, writeable= rw=='w' or rw=='rw' ): - print "Error creating VBD vbd=%d writeable=%d\n" % (virt_dev,rw) - xc.domain_destroy ( dom=id ) - sys.exit() + if xc.vbd_create( dom=id, vbd=virt_dev, + writeable= rw=='w' or rw=='rw' ): + print "Error creating VBD %d (writeable=%d)\n" % (virt_dev,rw) + xc.domain_destroy ( dom=id ) + sys.exit() - if xc.vbd_setextents( dom=id, - vbd=virt_dev, - extents=segments): - print "Error populating VBD vbd=%d\n" % virt_dev - xc.domain_destroy ( dom=id ) - sys.exit() - - # setup virtual firewall rules for all aliases - for ip in vfr_ipaddr: - xenctl.utils.setup_vfr_rules_for_vif( id, 0, ip ) - - # check for physical device access - for (pci_bus, pci_dev, pci_func) in pci_device_list: - if xc.physdev_pci_access_modify( - dom=id, bus=pci_bus, dev=pci_dev, func=pci_func, enable=1 ) < 0: - print "Non-fatal error enabling PCI device access." - else: - print "Enabled PCI access (%d:%d:%d)." 
% (pci_bus,pci_dev,pci_func) + if xc.vbd_setextents( dom=id, + vbd=virt_dev, + extents=segments): + print "Error populating VBD vbd=%d\n" % virt_dev + xc.domain_destroy ( dom=id ) + sys.exit() + + if not new_io_world: + # setup virtual firewall rules for all aliases + for ip in vfr_ipaddr: + xenctl.utils.setup_vfr_rules_for_vif( id, 0, ip ) + + if new_io_world: + # check for physical device access + for (pci_bus, pci_dev, pci_func) in pci_device_list: + if xc.physdev_pci_access_modify( + dom=id, bus=pci_bus, dev=pci_dev, + func=pci_func, enable=1 ) < 0: + print "Non-fatal error enabling PCI device access." + else: + print "Enabled PCI access (%d:%d:%d)." % \ + (pci_bus,pci_dev,pci_func) if xc.domain_start( dom=id ) < 0: print "Error starting domain" xc.domain_destroy ( dom=id ) sys.exit() - return (id, xend_response['console_port']) + return (id, cons_response['console_port']) # end of make_domain() def mkpidfile(): diff --git a/tools/xc/lib/xc.h b/tools/xc/lib/xc.h index 4afb905955..eb1b07da91 100644 --- a/tools/xc/lib/xc.h +++ b/tools/xc/lib/xc.h @@ -64,7 +64,7 @@ int xc_linux_save(int xc_handle, int verbose); int xc_linux_restore(int xc_handle, - u64 domid, + u64 domid, const char *state_file, int verbose, u64 *pdomid); @@ -74,13 +74,15 @@ int xc_linux_build(int xc_handle, const char *image_name, const char *ramdisk_name, const char *cmdline, - unsigned int control_evtchn); + unsigned int control_evtchn, + int io_priv); int xc_netbsd_build(int xc_handle, u64 domid, const char *image_name, const char *cmdline, - unsigned int control_evtchn); + unsigned int control_evtchn, + int io_priv); int xc_bvtsched_global_set(int xc_handle, unsigned long ctx_allow); @@ -248,15 +250,15 @@ int xc_shadow_control(int xc_handle, int xc_domain_setname(int xc_handle, u64 domid, - char *name); + char *name); int xc_domain_setinitialmem(int xc_handle, - u64 domid, - unsigned int initial_memkb); + u64 domid, + unsigned int initial_memkb); int xc_domain_setmaxmem(int xc_handle, - u64 
domid, - unsigned int max_memkb); + u64 domid, + unsigned int max_memkb); #endif /* __XC_H__ */ diff --git a/tools/xc/lib/xc_linux_build.c b/tools/xc/lib/xc_linux_build.c index 42696666a8..67351210fc 100644 --- a/tools/xc/lib/xc_linux_build.c +++ b/tools/xc/lib/xc_linux_build.c @@ -74,7 +74,8 @@ static int setup_guestos(int xc_handle, full_execution_context_t *ctxt, const char *cmdline, unsigned long shared_info_frame, - unsigned int control_evtchn) + unsigned int control_evtchn, + int io_priv) { l1_pgentry_t *vl1tab=NULL, *vl1e=NULL; l2_pgentry_t *vl2tab=NULL, *vl2e=NULL; @@ -268,7 +269,7 @@ static int setup_guestos(int xc_handle, memset(start_info, 0, sizeof(*start_info)); start_info->nr_pages = nr_pages; start_info->shared_info = shared_info_frame << PAGE_SHIFT; - start_info->flags = 0; + start_info->flags = io_priv ? SIF_PRIVILEGED : 0; start_info->pt_base = vpt_start; start_info->nr_pt_frames = nr_pt_pages; start_info->mfn_list = vphysmap_start; @@ -381,7 +382,8 @@ int xc_linux_build(int xc_handle, const char *image_name, const char *ramdisk_name, const char *cmdline, - unsigned int control_evtchn) + unsigned int control_evtchn, + int io_priv) { dom0_op_t launch_op, op; int initrd_fd = -1; @@ -446,7 +448,7 @@ int xc_linux_build(int xc_handle, &vstartinfo_start, &vkern_entry, ctxt, cmdline, op.u.getdomaininfo.shared_info_frame, - control_evtchn) < 0 ) + control_evtchn, io_priv) < 0 ) { ERROR("Error constructing guest OS"); goto error_out; @@ -560,13 +562,13 @@ static int readelfimage_base_and_size(char *elfbase, if ( (ehdr->e_phoff + (ehdr->e_phnum * ehdr->e_phentsize)) > elfsize ) { - ERROR("ELF program headers extend beyond end of image."); + ERROR("ELF program headers extend beyond end of image."); return -EINVAL; } if ( (ehdr->e_shoff + (ehdr->e_shnum * ehdr->e_shentsize)) > elfsize ) { - ERROR("ELF section headers extend beyond end of image."); + ERROR("ELF section headers extend beyond end of image."); return -EINVAL; } @@ -642,7 +644,7 @@ static int 
loadelfimage(char *elfbase, int pmh, unsigned long *parray, { phdr = (Elf_Phdr *)(elfbase + ehdr->e_phoff + (h*ehdr->e_phentsize)); if ( !is_loadable_phdr(phdr) ) - continue; + continue; for ( done = 0; done < phdr->p_filesz; done += chunksz ) { diff --git a/tools/xc/lib/xc_netbsd_build.c b/tools/xc/lib/xc_netbsd_build.c index 7c67d57d71..8260c75ea4 100644 --- a/tools/xc/lib/xc_netbsd_build.c +++ b/tools/xc/lib/xc_netbsd_build.c @@ -63,7 +63,8 @@ static int setup_guestos(int xc_handle, full_execution_context_t *ctxt, const char *cmdline, unsigned long shared_info_frame, - unsigned int control_evtchn) + unsigned int control_evtchn, + int io_priv) { l1_pgentry_t *vl1tab=NULL, *vl1e=NULL; l2_pgentry_t *vl2tab=NULL, *vl2e=NULL; @@ -175,7 +176,7 @@ static int setup_guestos(int xc_handle, start_info->mod_len = symtab_len; start_info->nr_pages = tot_pages; start_info->shared_info = shared_info_frame << PAGE_SHIFT; - start_info->flags = 0; + start_info->flags = io_priv ? SIF_PRIVILEGED : 0; start_info->domain_controller_evtchn = control_evtchn; strncpy(start_info->cmd_line, cmdline, MAX_CMDLINE); start_info->cmd_line[MAX_CMDLINE-1] = '\0'; @@ -212,7 +213,8 @@ int xc_netbsd_build(int xc_handle, u64 domid, const char *image_name, const char *cmdline, - unsigned int control_evtchn) + unsigned int control_evtchn, + int io_priv) { dom0_op_t launch_op, op; unsigned long load_addr; @@ -269,7 +271,7 @@ int xc_netbsd_build(int xc_handle, &virt_startinfo_addr, &load_addr, &st_ctxt, cmdline, op.u.getdomaininfo.shared_info_frame, - control_evtchn) < 0 ) + control_evtchn, io_priv) < 0 ) { ERROR("Error constructing guest OS"); goto error_out; diff --git a/tools/xc/py/Xc.c b/tools/xc/py/Xc.c index 92f77f7051..322a20b411 100644 --- a/tools/xc/py/Xc.c +++ b/tools/xc/py/Xc.c @@ -228,18 +228,19 @@ static PyObject *pyxc_linux_build(PyObject *self, u64 dom; char *image, *ramdisk = NULL, *cmdline = ""; - int control_evtchn; + int control_evtchn, io_priv = 0; static char *kwd_list[] = { "dom", 
"control_evtchn", - "image", "ramdisk", "cmdline", NULL }; + "image", "ramdisk", "cmdline", "io_priv", + NULL }; - if ( !PyArg_ParseTupleAndKeywords(args, kwds, "Lis|ss", kwd_list, + if ( !PyArg_ParseTupleAndKeywords(args, kwds, "Lis|ssi", kwd_list, &dom, &control_evtchn, - &image, &ramdisk, &cmdline) ) + &image, &ramdisk, &cmdline, &io_priv) ) return NULL; if ( xc_linux_build(xc->xc_handle, dom, image, - ramdisk, cmdline, control_evtchn) != 0 ) + ramdisk, cmdline, control_evtchn, io_priv) != 0 ) return PyErr_SetFromErrno(xc_error); Py_INCREF(zero); @@ -254,18 +255,19 @@ static PyObject *pyxc_netbsd_build(PyObject *self, u64 dom; char *image, *ramdisk = NULL, *cmdline = ""; - int control_evtchn; + int control_evtchn, io_priv = 0; static char *kwd_list[] = { "dom", "control_evtchn", - "image", "ramdisk", "cmdline", NULL }; + "image", "ramdisk", "cmdline", "io_priv", + NULL }; - if ( !PyArg_ParseTupleAndKeywords(args, kwds, "Lis|ss", kwd_list, + if ( !PyArg_ParseTupleAndKeywords(args, kwds, "Lis|ssi", kwd_list, &dom, &control_evtchn, - &image, &ramdisk, &cmdline) ) + &image, &ramdisk, &cmdline, &io_priv) ) return NULL; if ( xc_netbsd_build(xc->xc_handle, dom, image, - cmdline, control_evtchn) != 0 ) + cmdline, control_evtchn, io_priv) != 0 ) return PyErr_SetFromErrno(xc_error); Py_INCREF(zero); @@ -1160,7 +1162,8 @@ static PyMethodDef pyxc_methods[] = { " dom [long]: Identifier of domain to build into.\n" " image [str]: Name of kernel image file. May be gzipped.\n" " ramdisk [str, n/a]: Name of ramdisk file, if any.\n" - " cmdline [str, n/a]: Kernel parameters, if any.\n\n" + " cmdline [str, n/a]: Kernel parameters, if any.\n" + " io_priv [boolean]: Does the domain have IO privileges?\n\n" "Returns: [int] 0 on success; -1 on error.\n" }, { "netbsd_build", @@ -1169,7 +1172,8 @@ static PyMethodDef pyxc_methods[] = { "Build a new NetBSD guest OS.\n" " dom [long]: Identifier of domain to build into.\n" " image [str]: Name of kernel image file. 
May be gzipped.\n" - " cmdline [str, n/a]: Kernel parameters, if any.\n\n" + " cmdline [str, n/a]: Kernel parameters, if any.\n" + " io_priv [boolean]: Does the domain have IO privileges?\n\n" "Returns: [int] 0 on success; -1 on error.\n" }, { "bvtsched_global_set", diff --git a/tools/xend/lib/blkif.py b/tools/xend/lib/blkif.py new file mode 100644 index 0000000000..94e058f7ce --- /dev/null +++ b/tools/xend/lib/blkif.py @@ -0,0 +1,143 @@ + +################################################################# +## xend/blkif.py -- Block-interface management functions for Xend +## Copyright (c) 2004, K A Fraser (University of Cambridge) +################################################################# + +import errno, re, os, select, signal, socket, struct, sys +import xend.main, xend.console, xend.manager, xend.utils, Xc + +CMSG_BLKIF_BE = 1 +CMSG_BLKIF_FE = 2 +CMSG_BLKIF_FE_INTERFACE_STATUS_CHANGED = 0 +CMSG_BLKIF_FE_DRIVER_STATUS_CHANGED = 32 +CMSG_BLKIF_FE_INTERFACE_CONNECT = 33 +CMSG_BLKIF_FE_INTERFACE_DISCONNECT = 34 +CMSG_BLKIF_BE_CREATE = 0 +CMSG_BLKIF_BE_DESTROY = 1 +CMSG_BLKIF_BE_CONNECT = 2 +CMSG_BLKIF_BE_DISCONNECT = 3 +CMSG_BLKIF_BE_VBD_CREATE = 4 +CMSG_BLKIF_BE_VBD_DESTROY = 5 +CMSG_BLKIF_BE_VBD_GROW = 6 +CMSG_BLKIF_BE_VBD_SHRINK = 7 + +pendmsg = None +pendaddr = None + +def backend_tx_req(msg): + port = xend.main.dom0_port + if port.space_to_write_request(): + port.write_request(msg) + port.notify() + else: + xend.blkif.pendmsg = msg + +def backend_rx_req(port, msg): + port.write_response(msg) + +def backend_rx_rsp(port, msg): + subtype = (msg.get_header())['subtype'] + print "Received blkif-be response, subtype %d" % subtype + if subtype == CMSG_BLKIF_BE_CREATE: + rsp = { 'success': True } + xend.main.send_management_response(rsp, xend.blkif.pendaddr) + elif subtype == CMSG_BLKIF_BE_CONNECT: + (dom,hnd,evtchn,frame,st) = struct.unpack("QIILI", msg.get_payload()) + blkif = interface.list[xend.main.port_from_dom(dom).local_port] + msg = 
xend.utils.message(CMSG_BLKIF_FE, \ + CMSG_BLKIF_FE_INTERFACE_STATUS_CHANGED, 0) + msg.append_payload(struct.pack("III",0,2,blkif.evtchn['port2'])) + blkif.ctrlif_tx_req(xend.main.port_list[blkif.key], msg) + elif subtype == CMSG_BLKIF_BE_VBD_CREATE: + (dom,hnd,vdev,ro,st) = struct.unpack("QIHII", msg.get_payload()) + blkif = interface.list[xend.main.port_from_dom(dom).local_port] + (pdev, start_sect, nr_sect, readonly) = blkif.devices[vdev] + msg = xend.utils.message(CMSG_BLKIF_BE, CMSG_BLKIF_BE_VBD_GROW, 0) + msg.append_payload(struct.pack("QIHHHQQI",dom,0,vdev,0, \ + pdev,start_sect,nr_sect,0)) + backend_tx_req(msg) + elif subtype == CMSG_BLKIF_BE_VBD_GROW: + rsp = { 'success': True } + xend.main.send_management_response(rsp, xend.blkif.pendaddr) + +def backend_do_work(port): + global pendmsg + if pendmsg and port.space_to_write_request(): + port.write_request(pendmsg) + pendmsg = None + return True + return False + + +class interface: + + # Dictionary of all block-device interfaces. + list = {} + + + # NB. 'key' is an opaque value that has no meaning in this class. + def __init__(self, dom, key): + self.dom = dom + self.key = key + self.devices = {} + self.pendmsg = None + interface.list[key] = self + msg = xend.utils.message(CMSG_BLKIF_BE, CMSG_BLKIF_BE_CREATE, 0) + msg.append_payload(struct.pack("QII",dom,0,0)) + xend.blkif.pendaddr = xend.main.mgmt_req_addr + backend_tx_req(msg) + + # Attach a device to the specified interface + def attach_device(self, vdev, pdev, start_sect, nr_sect, readonly): + if self.devices.has_key(vdev): + return False + self.devices[vdev] = (pdev, start_sect, nr_sect, readonly) + msg = xend.utils.message(CMSG_BLKIF_BE, CMSG_BLKIF_BE_VBD_CREATE, 0) + msg.append_payload(struct.pack("QIHII",self.dom,0,vdev,readonly,0)) + xend.blkif.pendaddr = xend.main.mgmt_req_addr + backend_tx_req(msg) + return True + + + # Completely destroy this interface. 
+ def destroy(self): + del interface.list[self.key] + msg = xend.utils.message(CMSG_BLKIF_BE, CMSG_BLKIF_BE_DESTROY, 0) + msg.append_payload(struct.pack("QII",self.dom,0,0)) + backend_tx_req(msg) + + + # The parameter @port is the control-interface event channel. This method + # returns True if messages were written to the control interface. + def ctrlif_transmit_work(self, port): + if self.pendmsg and port.space_to_write_request(): + port.write_request(self.pendmsg) + self.pendmsg = None + return True + return False + + def ctrlif_tx_req(self, port, msg): + if port.space_to_write_request(): + port.write_request(msg) + port.notify() + else: + self.pendmsg = msg + + def ctrlif_rx_req(self, port, msg): + port.write_response(msg) + subtype = (msg.get_header())['subtype'] + if subtype == CMSG_BLKIF_FE_DRIVER_STATUS_CHANGED: + msg = xend.utils.message(CMSG_BLKIF_FE, \ + CMSG_BLKIF_FE_INTERFACE_STATUS_CHANGED, 0) + msg.append_payload(struct.pack("III",0,1,0)) + self.ctrlif_tx_req(port, msg) + elif subtype == CMSG_BLKIF_FE_INTERFACE_CONNECT: + (hnd,frame) = struct.unpack("IL", msg.get_payload()) + xc = Xc.new() + self.evtchn = xc.evtchn_bind_interdomain(dom1=0,dom2=self.dom) + msg = xend.utils.message(CMSG_BLKIF_BE, \ + CMSG_BLKIF_BE_CONNECT, 0) + msg.append_payload(struct.pack("QIILI",self.dom,0, \ + self.evtchn['port1'],frame,0)) + backend_tx_req(msg) diff --git a/tools/xend/lib/console.py b/tools/xend/lib/console.py index aad6069979..57898817f5 100644 --- a/tools/xend/lib/console.py +++ b/tools/xend/lib/console.py @@ -5,7 +5,7 @@ ############################################################# import errno, re, os, select, signal, socket, struct, sys - +import xend.blkif, xend.main, xend.manager, xend.utils, Xc ## ## interface: @@ -16,7 +16,7 @@ import errno, re, os, select, signal, socket, struct, sys ## CONNECTED: sending/receiving console data on TCP port 'self.port' ## ## A dictionary of all active interfaces, indexed by TCP socket descriptor, -## is accessible as 
'interface.interface_list'. +## is accessible as 'interface.list_by_fd'. ## ## NB. When a class instance is to be destroyed you *must* call the 'close' ## method. Otherwise a stale reference will eb left in the interface list. @@ -30,7 +30,11 @@ class interface: # Dictionary of all active (non-closed) console interfaces. - interface_list = {} + list_by_fd = {} + + + # Dictionary of all console interfaces, closed and open. + list = {} # NB. 'key' is an opaque value that has no meaning in this class. @@ -38,6 +42,9 @@ class interface: self.status = interface.CLOSED self.port = port self.key = key + self.rbuf = xend.utils.buffer() + self.wbuf = xend.utils.buffer() + interface.list[key] = self # Is this interface closed (inactive)? @@ -58,14 +65,14 @@ class interface: # Close the interface, if it is not closed already. def close(self): if not self.closed(): - del interface.interface_list[self.sock.fileno()] + del interface.list_by_fd[self.sock.fileno()] self.sock.close() del self.sock self.status = interface.CLOSED # Move the interface into the 'listening' state. Opens a new listening - # socket and updates 'interface_list'. + # socket and updates 'list_by_fd'. def listen(self): # Close old socket (if any), and create a fresh one. self.close() @@ -80,7 +87,7 @@ class interface: # Announce the new status of thsi interface. self.status = interface.LISTENING - interface.interface_list[self.sock.fileno()] = self + interface.list_by_fd[self.sock.fileno()] = self except: # In case of trouble ensure we get rid of dangling socket reference @@ -105,7 +112,69 @@ class interface: # Publish the new socket and the new interface state. self.sock = sock self.status = interface.CONNECTED - interface.interface_list[self.sock.fileno()] = self + interface.list_by_fd[self.sock.fileno()] = self return 1 + # Completely destroy a console interface. 
+ def destroy(self): + self.close() + del interface.list[self.key] + + + # Do work triggered by resource availability on a console-interface socket. + def socket_work(self): + # If the interface is listening, check for pending connections. + if self.listening(): + self.connect() + + # All done if the interface is not connected. + if not self.connected(): + return + + # Send as much pending data as possible via the socket. + while not self.rbuf.empty(): + try: + bytes = self.sock.send(self.rbuf.peek()) + if bytes > 0: + self.rbuf.discard(bytes) + except socket.error, error: + pass + + # Read as much data as is available. Don't worry about + # overflowing our buffer: it's more important to read the + # incoming data stream and detect errors or closure of the + # remote end in a timely manner. + try: + while 1: + data = self.sock.recv(2048) + # Return of zero means the remote end has disconnected. + # We therefore return the console interface to listening. + if not data: + self.listen() + break + self.wbuf.write(data) + except socket.error, error: + # Assume that most errors mean that the connection is dead. + # In such cases we return the interface to 'listening' state. + if error[0] != errno.EAGAIN: + print "Better return to listening" + self.listen() + print "New status: " + str(self.status) + + + # The parameter @port is the control-interface event channel. This method + # returns True if messages were written to the control interface. 
+ def ctrlif_transmit_work(self, port): + work_done = False + while not self.wbuf.empty() and port.space_to_write_request(): + msg = xend.utils.message(0, 0, 0) + msg.append_payload(self.wbuf.read(msg.MAX_PAYLOAD)) + port.write_request(msg) + work_done = True + return work_done + + + def ctrlif_rx_req(self, port, msg): + self.rbuf.write(msg.get_payload()) + port.write_response(msg) diff --git a/tools/xend/lib/domain_controller.h b/tools/xend/lib/domain_controller.h index 14f970dd04..68d4fac1d2 100644 --- a/tools/xend/lib/domain_controller.h +++ b/tools/xend/lib/domain_controller.h @@ -56,29 +56,113 @@ typedef struct { #define CMSG_BLKIF_BE 1 /* Block-device backend */ #define CMSG_BLKIF_FE 2 /* Block-device frontend */ + +/****************************************************************************** + * CONSOLE DEFINITIONS + */ + /* * Subtypes for console messages. */ #define CMSG_CONSOLE_DATA 0 + +/****************************************************************************** + * BLOCK-INTERFACE FRONTEND DEFINITIONS + */ + +/* Messages from domain controller to guest. */ +#define CMSG_BLKIF_FE_INTERFACE_STATUS_CHANGED 0 + +/* Messages from guest to domain controller. */ +#define CMSG_BLKIF_FE_DRIVER_STATUS_CHANGED 32 +#define CMSG_BLKIF_FE_INTERFACE_CONNECT 33 +#define CMSG_BLKIF_FE_INTERFACE_DISCONNECT 34 + +/* These are used by both front-end and back-end drivers. */ +#define blkif_vdev_t u16 +#define blkif_pdev_t u16 +#define blkif_sector_t u64 + +/* + * CMSG_BLKIF_FE_INTERFACE_STATUS_CHANGED: + * Notify a guest about a status change on one of its block interfaces. + * If the interface is DESTROYED or DOWN then the interface is disconnected: + * 1. The shared-memory frame is available for reuse. + * 2. Any unacknowledged messages pending on the interface were dropped. + */ +#define BLKIF_INTERFACE_STATUS_DESTROYED 0 /* Interface doesn't exist. */ +#define BLKIF_INTERFACE_STATUS_DISCONNECTED 1 /* Exists but is disconnected. 
*/ +#define BLKIF_INTERFACE_STATUS_CONNECTED 2 /* Exists and is connected. */ +typedef struct { + unsigned int handle; + unsigned int status; + unsigned int evtchn; /* status == BLKIF_INTERFACE_STATUS_CONNECTED */ +} blkif_fe_interface_status_changed_t; + /* - * Subtypes for block-device messages. + * CMSG_BLKIF_FE_DRIVER_STATUS_CHANGED: + * Notify the domain controller that the front-end driver is DOWN or UP. + * When the driver goes DOWN then the controller will send no more + * status-change notifications. When the driver comes UP then the controller + * will send a notification for each interface that currently exists. + * If the driver goes DOWN while interfaces are still UP, the domain + * will automatically take the interfaces DOWN. */ +#define BLKIF_DRIVER_STATUS_DOWN 0 +#define BLKIF_DRIVER_STATUS_UP 1 +typedef struct { + unsigned int status; /* BLKIF_DRIVER_STATUS_??? */ +} blkif_fe_driver_status_changed_t; + +/* + * CMSG_BLKIF_FE_INTERFACE_CONNECT: + * If successful, the domain controller will acknowledge with a + * STATUS_CONNECTED message. + */ +typedef struct { + unsigned int handle; + unsigned long shmem_frame; +} blkif_fe_interface_connect_t; + +/* + * CMSG_BLKIF_FE_INTERFACE_DISCONNECT: + * If successful, the domain controller will acknowledge with a + * STATUS_DISCONNECTED message. + */ +typedef struct { + /* IN */ + unsigned int handle; + /* OUT */ + /* + * Tells driver how many interfaces it should expect to immediately + * receive notifications about. + */ + unsigned int nr_interfaces; +} blkif_fe_interface_disconnect_t; + + +/****************************************************************************** + * BLOCK-INTERFACE BACKEND DEFINITIONS + */ + +/* Messages from domain controller. */ #define CMSG_BLKIF_BE_CREATE 0 /* Create a new block-device interface. */ #define CMSG_BLKIF_BE_DESTROY 1 /* Destroy a block-device interface. */ -#define CMSG_BLKIF_BE_VBD_CREATE 2 /* Create a new VBD for an interface. 
*/ -#define CMSG_BLKIF_BE_VBD_DESTROY 3 /* Delete a VBD from an interface. */ -#define CMSG_BLKIF_BE_VBD_GROW 4 /* Append an extent to a given VBD. */ -#define CMSG_BLKIF_BE_VBD_SHRINK 5 /* Remove last extent from a given VBD. */ +#define CMSG_BLKIF_BE_CONNECT 2 /* Connect i/f to remote driver. */ +#define CMSG_BLKIF_BE_DISCONNECT 3 /* Disconnect i/f from remote driver. */ +#define CMSG_BLKIF_BE_VBD_CREATE 4 /* Create a new VBD for an interface. */ +#define CMSG_BLKIF_BE_VBD_DESTROY 5 /* Delete a VBD from an interface. */ +#define CMSG_BLKIF_BE_VBD_GROW 6 /* Append an extent to a given VBD. */ +#define CMSG_BLKIF_BE_VBD_SHRINK 7 /* Remove last extent from a given VBD. */ + +/* Messages to domain controller. */ +#define CMSG_BLKIF_BE_DRIVER_STATUS_CHANGED 32 /* - * Message request/response defintions for block-device messages. + * Message request/response definitions for block-device messages. */ -#define blkif_vdev_t u16 -#define blkif_pdev_t u16 -#define blkif_sector_t u64 - typedef struct { blkif_pdev_t device; blkif_sector_t sector_start; @@ -86,41 +170,91 @@ typedef struct { } blkif_extent_t; /* Non-specific 'okay' return. */ -#define BLKIF_STATUS_OKAY 0 +#define BLKIF_BE_STATUS_OKAY 0 /* Non-specific 'error' return. */ -#define BLKIF_STATUS_ERROR 1 +#define BLKIF_BE_STATUS_ERROR 1 /* The following are specific error returns. */ -#define BLKIF_STATUS_INTERFACE_EXISTS 2 -#define BLKIF_STATUS_INTERFACE_NOT_FOUND 3 +#define BLKIF_BE_STATUS_INTERFACE_EXISTS 2 +#define BLKIF_BE_STATUS_INTERFACE_NOT_FOUND 3 +#define BLKIF_BE_STATUS_INTERFACE_CONNECTED 4 +#define BLKIF_BE_STATUS_VBD_EXISTS 5 +#define BLKIF_BE_STATUS_VBD_NOT_FOUND 6 +#define BLKIF_BE_STATUS_OUT_OF_MEMORY 7 +#define BLKIF_BE_STATUS_EXTENT_NOT_FOUND 8 +#define BLKIF_BE_STATUS_MAPPING_ERROR 9 /* This macro can be used to create an array of descriptive error strings. 
*/ -#define BLKIF_STATUS_ERRORS { \ - "Okay", \ - "Non-specific error", \ - "Interface already exists", \ - "Interface not found" } +#define BLKIF_BE_STATUS_ERRORS { \ + "Okay", \ + "Non-specific error", \ + "Interface already exists", \ + "Interface not found", \ + "Interface is still connected", \ + "VBD already exists", \ + "VBD not found", \ + "Out of memory", \ + "Extent not found for VBD", \ + "Could not map domain memory" } -/* CMSG_BLKIF_CREATE */ +/* + * CMSG_BLKIF_BE_CREATE: + * When the driver sends a successful response then the interface is fully + * created. The controller will send a DOWN notification to the front-end + * driver. + */ typedef struct { /* IN */ domid_t domid; /* Domain attached to new interface. */ unsigned int blkif_handle; /* Domain-specific interface handle. */ - unsigned int evtchn; /* Event channel for notifications. */ - unsigned long shmem_frame; /* Page cont. shared comms window. */ /* OUT */ unsigned int status; -} blkif_create_t; +} blkif_be_create_t; -/* CMSG_BLKIF_DESTROY */ +/* + * CMSG_BLKIF_BE_DESTROY: + * When the driver sends a successful response then the interface is fully + * torn down. The controller will send a DESTROYED notification to the + * front-end driver. + */ typedef struct { /* IN */ domid_t domid; /* Identify interface to be destroyed. */ unsigned int blkif_handle; /* ...ditto... */ /* OUT */ unsigned int status; -} blkif_destroy_t; +} blkif_be_destroy_t; -/* CMSG_BLKIF_VBD_CREATE */ +/* + * CMSG_BLKIF_BE_CONNECT: + * When the driver sends a successful response then the interface is fully + * connected. The controller will send a CONNECTED notification to the + * front-end driver. + */ +typedef struct { + /* IN */ + domid_t domid; /* Domain attached to new interface. */ + unsigned int blkif_handle; /* Domain-specific interface handle. */ + unsigned int evtchn; /* Event channel for notifications. */ + unsigned long shmem_frame; /* Page cont. shared comms window. 
*/ + /* OUT */ + unsigned int status; +} blkif_be_connect_t; + +/* + * CMSG_BLKIF_BE_DISCONNECT: + * When the driver sends a successful response then the interface is fully + * disconnected. The controller will send a DOWN notification to the front-end + * driver. + */ +typedef struct { + /* IN */ + domid_t domid; /* Domain attached to new interface. */ + unsigned int blkif_handle; /* Domain-specific interface handle. */ + /* OUT */ + unsigned int status; +} blkif_be_disconnect_t; + +/* CMSG_BLKIF_BE_VBD_CREATE */ typedef struct { /* IN */ domid_t domid; /* Identify blkdev interface. */ @@ -129,9 +263,9 @@ typedef struct { int readonly; /* Non-zero -> VBD isn't writeable. */ /* OUT */ unsigned int status; -} blkif_vbd_create_t; +} blkif_be_vbd_create_t; -/* CMSG_BLKIF_VBD_DESTROY */ +/* CMSG_BLKIF_BE_VBD_DESTROY */ typedef struct { /* IN */ domid_t domid; /* Identify blkdev interface. */ @@ -139,9 +273,9 @@ typedef struct { blkif_vdev_t vdevice; /* Interface-specific id of the VBD. */ /* OUT */ unsigned int status; -} blkif_vbd_destroy_t; +} blkif_be_vbd_destroy_t; -/* CMSG_BLKIF_VBD_GROW */ +/* CMSG_BLKIF_BE_VBD_GROW */ typedef struct { /* IN */ domid_t domid; /* Identify blkdev interface. */ @@ -150,9 +284,9 @@ typedef struct { blkif_extent_t extent; /* Physical extent to append to VBD. */ /* OUT */ unsigned int status; -} blkif_vbd_grow_t; +} blkif_be_vbd_grow_t; -/* CMSG_BLKIF_VBD_SHRINK */ +/* CMSG_BLKIF_BE_VBD_SHRINK */ typedef struct { /* IN */ domid_t domid; /* Identify blkdev interface. */ @@ -160,6 +294,23 @@ typedef struct { blkif_vdev_t vdevice; /* Interface-specific id of the VBD. */ /* OUT */ unsigned int status; -} blkif_vbd_shrink_t; +} blkif_be_vbd_shrink_t; + +/* + * CMSG_BLKIF_BE_DRIVER_STATUS_CHANGED: + * Notify the domain controller that the back-end driver is DOWN or UP. + * If the driver goes DOWN while interfaces are still UP, the domain + * will automatically send DOWN notifications. 
+ */ +typedef struct { + /* IN */ + unsigned int status; /* BLKIF_DRIVER_STATUS_??? */ + /* OUT */ + /* + * Tells driver how many interfaces it should expect to immediately + * receive notifications about. + */ + unsigned int nr_interfaces; +} blkif_be_driver_status_changed_t; #endif /* __DOMAIN_CONTROLLER_H__ */ diff --git a/tools/xend/lib/main.py b/tools/xend/lib/main.py index 4b243b3307..7b5adbab83 100755 --- a/tools/xend/lib/main.py +++ b/tools/xend/lib/main.py @@ -5,7 +5,7 @@ ########################################################### import errno, re, os, pwd, select, signal, socket, struct, sys, time -import xend.console, xend.manager, xend.utils, Xc +import xend.blkif, xend.console, xend.manager, xend.utils, Xc # The following parameters could be placed in a configuration file. @@ -16,13 +16,35 @@ CONTROL_DIR = '/var/run/xend' UNIX_SOCK = 'management_sock' # relative to CONTROL_DIR +CMSG_CONSOLE = 0 +CMSG_BLKIF_BE = 1 +CMSG_BLKIF_FE = 2 + + +def port_from_dom(dom): + global port_list + for idx, port in port_list.items(): + if port.remote_dom == dom: + return port + return None + + +def send_management_response(response, addr): + try: + response = str(response) + print "Mgmt_rsp[%s]: %s" % (addr, response) + management_interface.sendto(response, addr) + except socket.error, error: + pass + + def daemon_loop(): # Could we do this more nicely? The xend.manager functions need access # to this global state to do their work. - global control_list, notifier + global port_list, notifier, management_interface, mgmt_req_addr, dom0_port - # List of all control interfaces, indexed by local event-channel port. - control_list = {} + # Lists of all interfaces, indexed by local event-channel port. + port_list = {} xc = Xc.new() @@ -44,6 +66,13 @@ def daemon_loop(): # notifications. notifier = xend.utils.notifier() + # The DOM0 control interface is not set up via the management interface. 
+ # Note that console messages don't come our way (actually, only driver + # back-ends should use the DOM0 control interface). + dom0_port = xend.utils.port(0) + notifier.bind(dom0_port.local_port) + port_list[dom0_port.local_port] = dom0_port + ## ## MAIN LOOP ## @@ -58,10 +87,10 @@ def daemon_loop(): waitset = select.poll() waitset.register(management_interface, select.POLLIN) waitset.register(notifier, select.POLLIN) - for idx, (port, rbuf, wbuf, con_if) in control_list.items(): + for idx, con_if in xend.console.interface.list_by_fd.items(): if not con_if.closed(): pflags = select.POLLIN - if not rbuf.empty() and con_if.connected(): + if not con_if.rbuf.empty() and con_if.connected(): pflags = select.POLLIN | select.POLLOUT waitset.register(con_if.sock.fileno(), pflags) @@ -72,16 +101,16 @@ def daemon_loop(): # These should consist of executable Python statements that call # well-known management functions (e.g., new_control_interface(dom=9)). try: - data, addr = management_interface.recvfrom(2048) + data, mgmt_req_addr = management_interface.recvfrom(2048) except socket.error, error: if error[0] != errno.EAGAIN: raise else: - if addr: + if mgmt_req_addr: # Evaluate the request in an exception-trapping sandbox. try: - print "Mgmt_req[%s]: %s" % (addr, data) - response = str(eval('xend.manager.'+data)) + print "Mgmt_req[%s]: %s" % (mgmt_req_addr, data) + response = eval('xend.manager.'+data) except: # Catch all exceptions and turn into an error response: @@ -97,69 +126,20 @@ def daemon_loop(): response = str(response) # Try to send a response to the requester. - try: - print "Mgmt_rsp[%s]: %s" % (addr, response) - management_interface.sendto(response, addr) - except socket.error, error: - pass + if response: + send_management_response(response, mgmt_req_addr) # Do work for every console interface that hit in the poll set. 
for (fd, events) in fdset: - if not xend.console.interface.interface_list.has_key(fd): - continue - con_if = xend.console.interface.interface_list[fd] - - # If the interface is listening, check for pending connections. - if con_if.listening(): - con_if.connect() - - # All done if the interface is not connected. - if not con_if.connected(): - continue - (port, rbuf, wbuf, con_if) = control_list[con_if.key] - - # Send as much pending data as possible via the socket. - while not rbuf.empty(): - try: - bytes = con_if.sock.send(rbuf.peek()) - if bytes > 0: - rbuf.discard(bytes) - except socket.error, error: - pass - - # Read as much data as is available. Don't worry about - # overflowing our buffer: it's more important to read the - # incoming data stream and detect errors or closure of the - # remote end in a timely manner. - try: - while 1: - data = con_if.sock.recv(2048) - # Return of zero means the remote end has disconnected. - # We therefore return the console interface to listening. - if not data: - con_if.listen() - break - wbuf.write(data) - except socket.error, error: - # Assume that most errors mean that the connection is dead. - # In such cases we return the interface to 'listening' state. - if error[0] != errno.EAGAIN: - print "Better return to listening" - con_if.listen() - print "New status: " + str(con_if.status) - - # We may now have pending data to send via the relevant - # inter-domain control interface. If so then we send all we can - # and notify the remote end. - work_done = False - while not wbuf.empty() and port.space_to_write_request(): - msg = xend.utils.message(0, 0, 0) - msg.append_payload(wbuf.read(msg.MAX_PAYLOAD)) - port.write_request(msg) - work_done = True - if work_done: - port.notify() - + if xend.console.interface.list_by_fd.has_key(fd): + con_if = xend.console.interface.list_by_fd[fd] + con_if.socket_work() + # We may now have pending data to send via the control + # interface. If so then send all we can and notify the remote. 
+ port = port_list[con_if.key] + if con_if.ctrlif_transmit_work(port): + port.notify() + # Process control-interface notifications from other guest OSes. while 1: # Grab a notification, if there is one. @@ -168,42 +148,69 @@ def daemon_loop(): break (idx, type) = notification - if not control_list.has_key(idx): + if not port_list.has_key(idx): continue - (port, rbuf, wbuf, con_if) = control_list[idx] + port = port_list[idx] work_done = False + con_if = False + if xend.console.interface.list.has_key(idx): + con_if = xend.console.interface.list[idx] + + blk_if = False + if xend.blkif.interface.list.has_key(idx): + blk_if = xend.blkif.interface.list[idx] + # If we pick up a disconnect notification then we do any necessary # cleanup. if type == notifier.EXCEPTION: ret = xc.evtchn_status(idx) if ret['status'] == 'unbound': notifier.unbind(idx) - con_if.close() - del control_list[idx], port, rbuf, wbuf, con_if + del port_list[idx], port + if con_if: + con_if.destroy() + del con_if + if blk_if: + blk_if.destroy() + del blk_if continue - # Read incoming requests. Currently assume that request - # message always containb console data. + # Process incoming requests. while port.request_to_read(): msg = port.read_request() - rbuf.write(msg.get_payload()) - port.write_response(msg) work_done = True - - # Incoming responses are currently thrown on the floor. + type = (msg.get_header())['type'] + if type == CMSG_CONSOLE and con_if: + con_if.ctrlif_rx_req(port, msg) + elif type == CMSG_BLKIF_FE and blk_if: + blk_if.ctrlif_rx_req(port, msg) + elif type == CMSG_BLKIF_BE and port == dom0_port: + xend.blkif.backend_rx_req(port, msg) + else: + port.write_response(msg) + + # Process incoming responses. while port.response_to_read(): msg = port.read_response() work_done = True + type = (msg.get_header())['type'] + if type == CMSG_BLKIF_BE and port == dom0_port: + xend.blkif.backend_rx_rsp(port, msg) - # Send as much pending console data as there is room for. 
- while not wbuf.empty() and port.space_to_write_request(): - msg = xend.utils.message(0, 0, 0) - msg.append_payload(wbuf.read(msg.MAX_PAYLOAD)) - port.write_request(msg) + # Send console data. + if con_if and con_if.ctrlif_transmit_work(port): work_done = True + # Send blkif messages. + if blk_if and blk_if.ctrlif_transmit_work(port): + work_done = True + + # Back-end block-device work. + if port == dom0_port and xend.blkif.backend_do_work(port): + work_done = True + # Finally, notify the remote end of any work that we did. if work_done: port.notify() diff --git a/tools/xend/lib/manager.py b/tools/xend/lib/manager.py index 42d66d3a95..ea7398cd4c 100644 --- a/tools/xend/lib/manager.py +++ b/tools/xend/lib/manager.py @@ -4,13 +4,13 @@ ## Copyright (c) 2004, K A Fraser (University of Cambridge) ############################################################# -import xend.console, xend.main, xend.utils +import xend.blkif, xend.console, xend.main, xend.utils ## ## new_control_interface: -## Create a new control interface with the specified domain 'dom'. -## The console port may also be specified; otehrwise a suitable port is +## Create a new control interface with the specified domain @dom. +## The console port may also be specified; otherwise a suitable port is ## automatically allocated. ## def new_control_interface(dom, console_port=-1): @@ -26,9 +26,8 @@ def new_control_interface(dom, console_port=-1): con_if = xend.console.interface(console_port, port.local_port) con_if.listen() - # Add control state to the master list. - xend.main.control_list[port.local_port] = \ - (port, xend.utils.buffer(), xend.utils.buffer(), con_if) + # Update the master port list. + xend.main.port_list[port.local_port] = port # Construct the successful response to be returned to the requester. 
response = { 'success': True } @@ -36,3 +35,81 @@ def new_control_interface(dom, console_port=-1): response['remote_port'] = port.remote_port response['console_port'] = console_port return response + + +## +## new_block_interface: +## Create a new block interface for the specified domain @dom. +## +def new_block_interface(dom, handle=-1): + # By default we create an interface with handle zero. + if handle < 0: + handle = 0 + + # We only support one interface per domain, which must have handle zero. + if handle != 0: + response = { 'success': False } + response['error_type'] = 'Bad handle %d (only handle 0 ' + \ + 'is supported)' % handle + return response + + # Find local event-channel port associated with the specified domain. + port = xend.main.port_from_dom(dom) + if not port: + response = { 'success': False } + response['error_type'] = 'Unknown domain %d' % dom + return response + + # The interface must not already exist. + if xend.blkif.interface.list.has_key(port.local_port): + response = { 'success': False } + response['error_type'] = 'Interface (dom=%d,handle=%d) already ' + \ + 'exists' % (dom, handle) + return response + + # Create the new interface. Initially no virtual devices are attached. + xend.blkif.interface(dom, port.local_port) + + # Response is deferred until back-end driver sends acknowledgement. + return None + + +## +## new_block_device: +## Attach a new virtual block device to the specified block interface +## (@dom, @handle). The new device is identified by @vdev, and maps to +## the real block extent (@pdev, @start_sect, @nr_sect). If @readonly then +## write requests to @vdev will be rejected. +## +def new_block_device(dom, handle, vdev, pdev, start_sect, nr_sect, readonly): + # We only support one interface per domain, which must have handle zero. 
+ if handle != 0: + response = { 'success': False } + response['error_type'] = 'Bad handle %d (only handle 0 ' + \ + 'is supported)' % handle + return response + + # Find local event-channel port associated with the specified domain. + port = xend.main.port_from_dom(dom) + if not port: + response = { 'success': False } + response['error_type'] = 'Unknown domain %d' % dom + return response + + # The interface must exist. + if not xend.blkif.interface.list.has_key(port.local_port): + response = { 'success': False } + response['error_type'] = 'Interface (dom=%d,handle=%d) does not ' + \ + 'exists' % (dom, handle) + return response + + # The virtual device must not yet exist. + blkif = xend.blkif.interface.list[port.local_port] + if not blkif.attach_device(vdev, pdev, start_sect, nr_sect, readonly): + response = { 'success': False } + response['error_type'] = 'Vdevice (dom=%d,handle=%d,vdevice=%d) ' + \ + 'already exists' % (dom, handle, vdev) + return response + + # Response is deferred until back-end driver sends acknowledgement. + return None diff --git a/tools/xend/lib/utils.c b/tools/xend/lib/utils.c index 4883ec1a46..c28d682ec9 100644 --- a/tools/xend/lib/utils.c +++ b/tools/xend/lib/utils.c @@ -22,6 +22,8 @@ #include <signal.h> #include <xc.h> +#include <asm-xen/proc_cmd.h> + #include <hypervisor-if.h> #include "domain_controller.h" @@ -684,8 +686,23 @@ static PyObject *xu_port_new(PyObject *self, PyObject *args) goto fail2; } - if ( xc_evtchn_bind_interdomain(xup->xc_handle, - DOMID_SELF, dom, &port1, &port2) != 0 ) + if ( dom == 0ULL ) + { + /* + * The control-interface event channel for DOM0 is already set up. + * We use an ioctl to discover the port at our end of the channel. + */ + port1 = ioctl(xup->xc_handle, IOCTL_PRIVCMD_INITDOMAIN_EVTCHN, NULL); + port2 = -1; /* We don't need the remote end of the DOM0 link. 
*/ + if ( port1 < 0 ) + { + PyErr_SetString(port_error, "Could not open channel to DOM0"); + goto fail3; + } + } + else if ( xc_evtchn_bind_interdomain(xup->xc_handle, + DOMID_SELF, dom, + &port1, &port2) != 0 ) { PyErr_SetString(port_error, "Could not open channel to domain"); goto fail3; @@ -744,7 +761,8 @@ static void xu_port_dealloc(PyObject *self) { xu_port_object *xup = (xu_port_object *)self; unmap_control_interface(xup->mem_fd, xup->interface); - (void)xc_evtchn_close(xup->xc_handle, DOMID_SELF, xup->local_port); + if ( xup->remote_dom != 0ULL ) + (void)xc_evtchn_close(xup->xc_handle, DOMID_SELF, xup->local_port); (void)xc_interface_close(xup->xc_handle); (void)close(xup->mem_fd); PyObject_Del(self); diff --git a/tools/xend/setup.py b/tools/xend/setup.py index 1f39cb4572..5567d7093c 100644 --- a/tools/xend/setup.py +++ b/tools/xend/setup.py @@ -4,7 +4,8 @@ from distutils.core import setup, Extension utils = Extension("utils", extra_compile_args = ["-fno-strict-aliasing"], include_dirs = ["../xc/lib", - "../../xen/include/hypervisor-ifs"], + "../../xen/include/hypervisor-ifs", + "../../xenolinux-sparse/include"], library_dirs = ["../xc/lib"], libraries = ["xc"], sources = ["lib/utils.c"]) diff --git a/xen/arch/i386/pdb-stub.c b/xen/arch/i386/pdb-stub.c index 049f330cf6..5b42e9a746 100644 --- a/xen/arch/i386/pdb-stub.c +++ b/xen/arch/i386/pdb-stub.c @@ -51,6 +51,8 @@ static unsigned char pdb_xmit_checksum; unsigned long pdb_linux_pid_ptbr (unsigned long cr3, int pid); void pdb_linux_get_values(char *buffer, int length, unsigned long address, int pid, unsigned long cr3); +void pdb_linux_set_values(char *buffer, int length, unsigned long address, + int pid, unsigned long cr3); struct pdb_context { @@ -571,6 +573,12 @@ pdb_process_command (char *ptr, struct pt_regs *regs, unsigned long cr3, { hex2mem (ptr, (char *)addr, length); } + else if (pdb_ctx.process != -1) + { + pdb_linux_set_values(ptr, length, addr, + pdb_ctx.process, + pdb_ctx.ptbr); + } else { 
pdb_set_values (ptr, length, diff --git a/xen/common/debug-linux.c b/xen/common/debug-linux.c index 4fbcdf2918..ff767b51cd 100644 --- a/xen/common/debug-linux.c +++ b/xen/common/debug-linux.c @@ -171,6 +171,44 @@ void pdb_linux_get_values(char *buffer, int length, unsigned long address, } } + +void pdb_linux_set_value(int pid, unsigned long cr3, unsigned long addr, + u_char *value) +{ + unsigned long pgd; + unsigned long l2tab, page; + + /* get the process' pgd */ + pgd = pdb_linux_pid_ptbr(cr3, pid); + + /* get the l2 table entry */ + pdb_get_values((u_char *) &l2tab, sizeof(l2tab), + cr3, pgd + (addr >> PGDIR_SHIFT) * 4); + l2tab = (unsigned long)__va(machine_to_phys(cr3, l2tab) & PAGE_MASK); + + /* get the page table entry */ + pdb_get_values((u_char *) &page, sizeof(page), + cr3, l2tab + ((addr & L1_PAGE_BITS) >> PAGE_SHIFT) * 4); + page = (unsigned long)__va(machine_to_phys(cr3, page) & PAGE_MASK); + + /* set the byte */ + pdb_set_values(value, sizeof(u_char), cr3, page + (addr & ~PAGE_MASK)); +} + +void pdb_linux_set_values(char *buffer, int length, unsigned long address, + int pid, unsigned long cr3) +{ + int loop; + + /* it's difficult to imagine a more inefficient algorithm */ + for (loop = 0; loop < length; loop++) + { + pdb_linux_set_value(pid, cr3, address + loop, &buffer[loop * 2]); + } +} + +/**********************************************************************/ + /* * return 1 if is the virtual address is in the operating system's * address space, else 0 diff --git a/xen/common/event_channel.c b/xen/common/event_channel.c index c0bea86320..c6011cebb4 100644 --- a/xen/common/event_channel.c +++ b/xen/common/event_channel.c @@ -109,15 +109,18 @@ static long evtchn_bind_interdomain(evtchn_bind_interdomain_t *bind) goto out; } + /* 'Allocate' port1 before searching for a free port2. 
*/ + p1->event_channel[port1].state = ECS_INTERDOMAIN; + if ( (port2 = get_free_port(p2)) < 0 ) { + p1->event_channel[port1].state = ECS_FREE; rc = port2; goto out; } p1->event_channel[port1].u.remote.dom = p2; p1->event_channel[port1].u.remote.port = (u16)port2; - p1->event_channel[port1].state = ECS_INTERDOMAIN; p2->event_channel[port2].u.remote.dom = p1; p2->event_channel[port2].u.remote.port = (u16)port1; diff --git a/xen/common/kernel.c b/xen/common/kernel.c index d8d5d03251..7f814391cf 100644 --- a/xen/common/kernel.c +++ b/xen/common/kernel.c @@ -73,6 +73,9 @@ unsigned char opt_pdb[10] = "none"; unsigned int opt_tbuf_size = 1; /* opt_sched: scheduler - default to Borrowed Virtual Time */ char opt_sched[10] = "bvt"; +/* opt_physdev_dom0_hide: list of PCI slots to hide from dom0 + * Should have the format '(%02x:%02x.%1x)(%02x:%02x.%1x)...etc' */ +char opt_physdev_dom0_hide[20] = ""; static struct { unsigned char *name; @@ -94,6 +97,7 @@ static struct { { "pdb", OPT_STR, &opt_pdb }, { "tbuf_size", OPT_UINT, &opt_tbuf_size }, { "sched", OPT_STR, &opt_sched }, + { "physdev_dom0_hide",OPT_STR, &opt_physdev_dom0_hide }, { NULL, 0, NULL } }; diff --git a/xen/common/physdev.c b/xen/common/physdev.c index 0d14a31527..0cc2654e8a 100644 --- a/xen/common/physdev.c +++ b/xen/common/physdev.c @@ -115,16 +115,17 @@ static void add_dev_to_task(struct task_struct *p, /* * physdev_pci_access_modify: - * Allow/disallow access to a specific PCI device. Also allow read access to - * PCI devices from the device to the root of the device tree. If the given - * device is a bridge, then the domain should get access to all the devices - * attached to that bridge (XXX this is unimplemented!). + * Allow/disallow access to a specific PCI device. Guests should not be + * allowed to see bridge devices as it needlessly complicates things (one + * possible exception to this is the AGP bridge). 
If the given device is a + * bridge, then the domain should get access to all the leaf devices below + * that bridge (XXX this is unimplemented!). */ int physdev_pci_access_modify( domid_t dom, int bus, int dev, int func, int enable) { struct task_struct *p; - struct pci_dev *pdev, *rdev, *tdev; + struct pci_dev *pdev; int rc = 0; if ( !IS_PRIV(current) ) @@ -145,7 +146,7 @@ int physdev_pci_access_modify( return -ESRCH; /* Make the domain privileged. */ - set_bit(PF_PRIVILEGED, &p->flags); + set_bit(PF_PRIVILEGED, &p->flags); /* Grant write access to the specified device. */ if ( (pdev = pci_find_slot(bus, PCI_DEVFN(dev, func))) == NULL ) @@ -155,27 +156,10 @@ int physdev_pci_access_modify( goto out; } add_dev_to_task(p, pdev, ACC_WRITE); + INFO(" add RW %02x:%02x:%02x\n", pdev->bus->number, PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn)); - /* Grant read access to the root device. */ - if ( (rdev = pci_find_slot(0, PCI_DEVFN(0, 0))) == NULL ) - { - INFO(" bizarre -- no PCI root dev\n"); - rc = -ENODEV; - goto out; - } - add_dev_to_task(p, rdev, ACC_READ); - INFO(" add R0 %02x:%02x:%02x\n", 0, 0, 0); - - /* Grant read access to all devices on the path to the root. */ - for ( tdev = pdev->bus->self; tdev != NULL; tdev = tdev->bus->self ) - { - add_dev_to_task(p, tdev, ACC_READ); - INFO(" add RO %02x:%02x:%02x\n", tdev->bus->number, - PCI_SLOT(tdev->devfn), PCI_FUNC(tdev->devfn)); - } - /* Is the device a bridge or cardbus? */ if ( pdev->hdr_type != PCI_HEADER_TYPE_NORMAL ) INFO("XXX can't give access to bridge devices yet\n"); @@ -256,8 +240,16 @@ static int do_base_address_access(phys_dev_t *pdev, int acc, int idx, if ( len != sizeof(u32) ) { - INFO("Guest attempting sub-dword %s to BASE_ADDRESS %d\n", + /* This isn't illegal, but there doesn't seem to be a very good reason + * to do it for normal devices (bridges are another matter). Since it + * would complicate the code below, we don't support this for now. 
*/ + + /* We could set *val to some value but the guest may well be in trouble + * anyway if this write fails. Hopefully the printk will give us a + * clue what went wrong. */ + printk("Guest attempting sub-dword %s to BASE_ADDRESS %d\n", (acc == ACC_READ) ? "read" : "write", idx); + return -EPERM; } @@ -420,7 +412,13 @@ static long pci_cfgreg_read(int bus, int dev, int func, int reg, phys_dev_t *pdev; if ( (ret = check_dev_acc(current, bus, dev, func, &pdev)) != 0 ) - return ret; + { + /* PCI spec states that reads from non-existent devices should return + * all 1s. In this case the domain has no read access, which should + * also look like the device is non-existent. */ + *val = 0xFFFFFFFF; + return ret; /* KAF: error return seems to matter on my test machine. */ + } /* Fake out read requests for some registers. */ switch ( reg ) @@ -608,6 +606,21 @@ long do_physdev_op(physdev_op_t *uop) return ret; } +/* Test if boot params specify this device should NOT be visible to DOM0 + * (e.g. so that another domain can control it instead) */ +int pcidev_dom0_hidden(struct pci_dev *dev) +{ + extern char opt_physdev_dom0_hide[]; + char cmp[10] = "(.......)"; + + strncpy(&cmp[1], dev->slot_name, 7); + + if ( strstr(opt_physdev_dom0_hide, dev->slot_name) == NULL ) + return 0; + + return 1; +} + /* Domain 0 has read access to all devices. */ void physdev_init_dom0(struct task_struct *p) @@ -619,14 +632,22 @@ void physdev_init_dom0(struct task_struct *p) pci_for_each_dev(dev) { - /* Skip bridges and other peculiarities for now. */ - if ( dev->hdr_type != PCI_HEADER_TYPE_NORMAL ) - continue; - pdev = kmalloc(sizeof(phys_dev_t), GFP_KERNEL); - pdev->dev = dev; - pdev->flags = ACC_WRITE; - pdev->state = 0; - pdev->owner = p; - list_add(&pdev->node, &p->pcidev_list); - } + if ( !pcidev_dom0_hidden(dev) ) + { + /* Skip bridges and other peculiarities for now. 
*/ + if ( dev->hdr_type != PCI_HEADER_TYPE_NORMAL ) + continue; + pdev = kmalloc(sizeof(phys_dev_t), GFP_KERNEL); + pdev->dev = dev; + pdev->flags = ACC_WRITE; + pdev->state = 0; + pdev->owner = p; + list_add(&pdev->node, &p->pcidev_list); + } + else + { + printk("Hiding PCI device %s from DOM0\n", dev->slot_name); + } + } } + diff --git a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/common.h b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/common.h index 4895172937..e6004b4a8e 100644 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/common.h +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/common.h @@ -34,23 +34,37 @@ typedef struct blkif_st { unsigned int evtchn; int irq; /* Comms information. */ - blk_ring_t *blk_ring_base; /* ioremap()'ed ptr to shmem_frame. */ + blkif_ring_t *blk_ring_base; /* ioremap()'ed ptr to shmem_frame. */ BLK_RING_IDX blk_req_cons; /* Request consumer. */ BLK_RING_IDX blk_resp_prod; /* Private version of response producer. */ /* VBDs attached to this interface. */ rb_root_t vbd_rb; /* Mapping from 16-bit vdevices to VBDs. */ spinlock_t vbd_lock; /* Protects VBD mapping. */ /* Private fields. */ + enum { DISCONNECTED, DISCONNECTING, CONNECTED } status; + /* + * DISCONNECT response is deferred until pending requests are ack'ed. + * We therefore need to store the id from the original request. 
+ */ + u8 disconnect_rspid; struct blkif_st *hash_next; struct list_head blkdev_list; spinlock_t blk_ring_lock; + atomic_t refcnt; } blkif_t; -void blkif_create(blkif_create_t *create); -void blkif_destroy(blkif_destroy_t *destroy); +void blkif_create(blkif_be_create_t *create); +void blkif_destroy(blkif_be_destroy_t *destroy); +void blkif_connect(blkif_be_connect_t *connect); +int blkif_disconnect(blkif_be_disconnect_t *disconnect, u8 rsp_id); +void __blkif_disconnect_complete(blkif_t *blkif); blkif_t *blkif_find_by_handle(domid_t domid, unsigned int handle); -void blkif_get(blkif_t *blkif); -void blkif_put(blkif_t *blkif); +#define blkif_get(_b) (atomic_inc(&(_b)->refcnt)) +#define blkif_put(_b) \ + do { \ + if ( atomic_dec_and_test(&(_b)->refcnt) ) \ + __blkif_disconnect_complete(_b); \ + } while (0) /* An entry in a list of xen_extents. */ typedef struct _blkif_extent_le { @@ -60,25 +74,25 @@ typedef struct _blkif_extent_le { typedef struct _vbd { blkif_vdev_t vdevice; /* what the domain refers to this vbd as */ - unsigned char mode; /* VBD_MODE_{R,W} */ + unsigned char readonly; /* Non-zero -> read-only */ unsigned char type; /* XD_TYPE_xxx */ blkif_extent_le_t *extents; /* list of xen_extents making up this vbd */ rb_node_t rb; /* for linking into R-B tree lookup struct */ } vbd_t; -long vbd_create(blkif_vbd_create_t *create_params); -long vbd_grow(blkif_vbd_grow_t *grow_params); -long vbd_shrink(blkif_vbd_shrink_t *shrink_params); -long vbd_destroy(blkif_vbd_destroy_t *delete_params); - -void destroy_all_vbds(struct task_struct *p); +void vbd_create(blkif_be_vbd_create_t *create); +void vbd_grow(blkif_be_vbd_grow_t *grow); +void vbd_shrink(blkif_be_vbd_shrink_t *shrink); +void vbd_destroy(blkif_be_vbd_destroy_t *delete); +int vbd_probe(blkif_t *blkif, vdisk_t *vbd_info, int max_vbds); +void destroy_all_vbds(blkif_t *blkif); typedef struct { blkif_t *blkif; unsigned long id; atomic_t pendcnt; unsigned short operation; - unsigned short status; + int status; } 
pending_req_t; /* Describes a [partial] disk extent (part of a block io request) */ @@ -91,7 +105,10 @@ typedef struct { int vbd_translate(phys_seg_t *pseg, blkif_t *blkif, int operation); -int blkif_be_controller_init(void); +void blkif_interface_init(void); +void blkif_ctrlif_init(void); + +void blkif_deschedule(blkif_t *blkif); void blkif_be_int(int irq, void *dev_id, struct pt_regs *regs); diff --git a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/control.c b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/control.c index c7ef10c3ba..0746ecfab0 100644 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/control.c +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/control.c @@ -10,37 +10,50 @@ static void blkif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id) { + DPRINTK("Received blkif backend message, subtype=%d\n", msg->subtype); + switch ( msg->subtype ) { case CMSG_BLKIF_BE_CREATE: - if ( msg->length != sizeof(blkif_create_t) ) + if ( msg->length != sizeof(blkif_be_create_t) ) goto parse_error; - blkif_create((blkif_create_t *)&msg->msg[0]); + blkif_create((blkif_be_create_t *)&msg->msg[0]); break; case CMSG_BLKIF_BE_DESTROY: - if ( msg->length != sizeof(blkif_destroy_t) ) + if ( msg->length != sizeof(blkif_be_destroy_t) ) goto parse_error; - blkif_destroy((blkif_destroy_t *)&msg->msg[0]); + blkif_destroy((blkif_be_destroy_t *)&msg->msg[0]); + break; + case CMSG_BLKIF_BE_CONNECT: + if ( msg->length != sizeof(blkif_be_connect_t) ) + goto parse_error; + blkif_connect((blkif_be_connect_t *)&msg->msg[0]); + break; + case CMSG_BLKIF_BE_DISCONNECT: + if ( msg->length != sizeof(blkif_be_disconnect_t) ) + goto parse_error; + if ( !blkif_disconnect((blkif_be_disconnect_t *)&msg->msg[0],msg->id) ) + return; /* Sending the response is deferred until later. 
*/ break; case CMSG_BLKIF_BE_VBD_CREATE: - if ( msg->length != sizeof(blkif_vbd_create_t) ) + if ( msg->length != sizeof(blkif_be_vbd_create_t) ) goto parse_error; - vbd_create((blkif_vbd_create_t *)&msg->msg[0]); + vbd_create((blkif_be_vbd_create_t *)&msg->msg[0]); break; case CMSG_BLKIF_BE_VBD_DESTROY: - if ( msg->length != sizeof(blkif_vbd_destroy_t) ) + if ( msg->length != sizeof(blkif_be_vbd_destroy_t) ) goto parse_error; - vbd_destroy((blkif_vbd_destroy_t *)&msg->msg[0]); + vbd_destroy((blkif_be_vbd_destroy_t *)&msg->msg[0]); break; case CMSG_BLKIF_BE_VBD_GROW: - if ( msg->length != sizeof(blkif_vbd_grow_t) ) + if ( msg->length != sizeof(blkif_be_vbd_grow_t) ) goto parse_error; - vbd_grow((blkif_vbd_grow_t *)&msg->msg[0]); + vbd_grow((blkif_be_vbd_grow_t *)&msg->msg[0]); break; case CMSG_BLKIF_BE_VBD_SHRINK: - if ( msg->length != sizeof(blkif_vbd_shrink_t) ) + if ( msg->length != sizeof(blkif_be_vbd_shrink_t) ) goto parse_error; - vbd_shrink((blkif_vbd_shrink_t *)&msg->msg[0]); + vbd_shrink((blkif_be_vbd_shrink_t *)&msg->msg[0]); break; default: goto parse_error; @@ -50,12 +63,24 @@ static void blkif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id) return; parse_error: + DPRINTK("Parse error while reading message subtype %d, len %d\n", + msg->subtype, msg->length); msg->length = 0; ctrl_if_send_response(msg); } -int blkif_ctrlif_init(void) +void blkif_ctrlif_init(void) { + ctrl_msg_t cmsg; + blkif_be_driver_status_changed_t st; + (void)ctrl_if_register_receiver(CMSG_BLKIF_BE, blkif_ctrlif_rx); - return 0; + + /* Send a driver-UP notification to the domain controller. 
*/ + cmsg.type = CMSG_BLKIF_BE; + cmsg.subtype = CMSG_BLKIF_BE_DRIVER_STATUS_CHANGED; + cmsg.length = sizeof(blkif_be_driver_status_changed_t); + st.status = BLKIF_DRIVER_STATUS_UP; + memcpy(cmsg.msg, &st, sizeof(st)); + ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE); } diff --git a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/interface.c b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/interface.c index 579795deb9..9acbac35ab 100644 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/interface.c +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/interface.c @@ -12,85 +12,223 @@ #define BLKIF_HASH(_d,_h) \ (((int)(_d)^(int)((_d)>>32)^(int)(_h))&(BLKIF_HASHSZ-1)) -static blkif_t *blkif_hash[BLKIF_HASHSZ]; +static kmem_cache_t *blkif_cachep; +static blkif_t *blkif_hash[BLKIF_HASHSZ]; blkif_t *blkif_find_by_handle(domid_t domid, unsigned int handle) { blkif_t *blkif = blkif_hash[BLKIF_HASH(domid, handle)]; while ( (blkif != NULL) && - (blkif->domid != domid) && - (blkif->handle != handle) ) + ((blkif->domid != domid) || (blkif->handle != handle)) ) blkif = blkif->hash_next; return blkif; } -void blkif_create(blkif_create_t *create) +void __blkif_disconnect_complete(blkif_t *blkif) +{ + ctrl_msg_t cmsg; + blkif_be_disconnect_t disc; + + /* + * These can't be done in __blkif_disconnect() because at that point there + * may be outstanding requests at the disc whose asynchronous responses + * must still be notified to the remote driver. + */ + unbind_evtchn_from_irq(blkif->evtchn); + vfree(blkif->blk_ring_base); + + /* Construct the deferred response message. 
*/ + cmsg.type = CMSG_BLKIF_BE; + cmsg.subtype = CMSG_BLKIF_BE_DISCONNECT; + cmsg.id = blkif->disconnect_rspid; + cmsg.length = sizeof(blkif_be_disconnect_t); + disc.domid = blkif->domid; + disc.blkif_handle = blkif->handle; + disc.status = BLKIF_BE_STATUS_OKAY; + memcpy(cmsg.msg, &disc, sizeof(disc)); + + /* + * Make sure message is constructed /before/ status change, because + * after the status change the 'blkif' structure could be deallocated at + * any time. Also make sure we send the response /after/ status change, + * as otherwise a subsequent CONNECT request could spuriously fail if + * another CPU doesn't see the status change yet. + */ + mb(); + if ( blkif->status != DISCONNECTING ) + BUG(); + blkif->status = DISCONNECTED; + mb(); + + /* Send the successful response. */ + ctrl_if_send_response(&cmsg); +} + +void blkif_create(blkif_be_create_t *create) { domid_t domid = create->domid; unsigned int handle = create->blkif_handle; - unsigned int evtchn = create->evtchn; - unsigned long shmem_frame = create->shmem_frame; blkif_t **pblkif, *blkif; - pblkif = &blkif_hash[BLKIF_HASH(domid, handle)]; - while ( *pblkif == NULL ) + if ( (blkif = kmem_cache_alloc(blkif_cachep, GFP_ATOMIC)) == NULL ) { - if ( ((*pblkif)->domid == domid) && ((*pblkif)->handle == handle) ) - goto found_match; - pblkif = &(*pblkif)->hash_next; + DPRINTK("Could not create blkif: out of memory\n"); + create->status = BLKIF_BE_STATUS_OUT_OF_MEMORY; + return; } - blkif = kmem_cache_alloc(blkif_cachep, GFP_KERNEL); memset(blkif, 0, sizeof(*blkif)); - blkif->domid = domid; - blkif->handle = handle; - blkif->evtchn = evtchn; - blkif->irq = bind_evtchn_to_irq(evtchn); - blkif->shmem_frame = shmem_frame; - blkif->shmem_vbase = ioremap(shmem_frame<<PAGE_SHIFT, PAGE_SIZE); + blkif->domid = domid; + blkif->handle = handle; + blkif->status = DISCONNECTED; spin_lock_init(&blkif->vbd_lock); spin_lock_init(&blkif->blk_ring_lock); + atomic_set(&blkif->refcnt, 0); - request_irq(irq, blkif_be_int, 0, 
"blkif-backend", blkif); + pblkif = &blkif_hash[BLKIF_HASH(domid, handle)]; + while ( *pblkif != NULL ) + { + if ( ((*pblkif)->domid == domid) && ((*pblkif)->handle == handle) ) + { + DPRINTK("Could not create blkif: already exists\n"); + create->status = BLKIF_BE_STATUS_INTERFACE_EXISTS; + kmem_cache_free(blkif_cachep, blkif); + return; + } + pblkif = &(*pblkif)->hash_next; + } blkif->hash_next = *pblkif; *pblkif = blkif; - create->status = BLKIF_STATUS_OKAY; - return; - - found_match: - create->status = BLKIF_STATUS_INTERFACE_EXISTS; - return; - - evtchn_in_use: - unbind_evtchn_from_irq(evtchn); /* drop refcnt */ - create->status = BLKIF_STATUS_ERROR; - return; + DPRINTK("Successfully created blkif\n"); + create->status = BLKIF_BE_STATUS_OKAY; } -void blkif_destroy(blkif_destroy_t *destroy) +void blkif_destroy(blkif_be_destroy_t *destroy) { domid_t domid = destroy->domid; unsigned int handle = destroy->blkif_handle; blkif_t **pblkif, *blkif; pblkif = &blkif_hash[BLKIF_HASH(domid, handle)]; - while ( (blkif = *pblkif) == NULL ) + while ( (blkif = *pblkif) != NULL ) { if ( (blkif->domid == domid) && (blkif->handle == handle) ) - goto found_match; + { + if ( blkif->status != DISCONNECTED ) + goto still_connected; + goto destroy; + } pblkif = &blkif->hash_next; } - destroy->status = BLKIF_STATUS_INTERFACE_NOT_FOUND; + destroy->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND; return; - found_match: - free_irq(blkif->irq, NULL); - unbind_evtchn_from_irq(blkif->evtchn); + still_connected: + destroy->status = BLKIF_BE_STATUS_INTERFACE_CONNECTED; + return; + + destroy: *pblkif = blkif->hash_next; + destroy_all_vbds(blkif); kmem_cache_free(blkif_cachep, blkif); - destroy->status = BLKIF_STATUS_OKAY; + destroy->status = BLKIF_BE_STATUS_OKAY; } +void blkif_connect(blkif_be_connect_t *connect) +{ + domid_t domid = connect->domid; + unsigned int handle = connect->blkif_handle; + unsigned int evtchn = connect->evtchn; + unsigned long shmem_frame = connect->shmem_frame; + struct 
vm_struct *vma; + pgprot_t prot; + int error; + blkif_t *blkif; + + blkif = blkif_find_by_handle(domid, handle); + if ( unlikely(blkif == NULL) ) + { + DPRINTK("blkif_connect attempted for non-existent blkif (%llu,%u)\n", + connect->domid, connect->blkif_handle); + connect->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND; + return; + } + + if ( (vma = get_vm_area(PAGE_SIZE, VM_IOREMAP)) == NULL ) + { + connect->status = BLKIF_BE_STATUS_OUT_OF_MEMORY; + return; + } + + prot = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED); + error = direct_remap_area_pages(&init_mm, VMALLOC_VMADDR(vma->addr), + shmem_frame<<PAGE_SHIFT, PAGE_SIZE, + prot, domid); + if ( error != 0 ) + { + if ( error == -ENOMEM ) + connect->status = BLKIF_BE_STATUS_OUT_OF_MEMORY; + else if ( error == -EFAULT ) + connect->status = BLKIF_BE_STATUS_MAPPING_ERROR; + else + connect->status = BLKIF_BE_STATUS_ERROR; + vfree(vma->addr); + return; + } + + if ( blkif->status != DISCONNECTED ) + { + connect->status = BLKIF_BE_STATUS_INTERFACE_CONNECTED; + vfree(vma->addr); + return; + } + + blkif->evtchn = evtchn; + blkif->irq = bind_evtchn_to_irq(evtchn); + blkif->shmem_frame = shmem_frame; + blkif->blk_ring_base = (blkif_ring_t *)vma->addr; + blkif->status = CONNECTED; + blkif_get(blkif); + + request_irq(blkif->irq, blkif_be_int, 0, "blkif-backend", blkif); + + connect->status = BLKIF_BE_STATUS_OKAY; +} + +int blkif_disconnect(blkif_be_disconnect_t *disconnect, u8 rsp_id) +{ + domid_t domid = disconnect->domid; + unsigned int handle = disconnect->blkif_handle; + blkif_t *blkif; + + blkif = blkif_find_by_handle(domid, handle); + if ( unlikely(blkif == NULL) ) + { + DPRINTK("blkif_disconnect attempted for non-existent blkif" + " (%llu,%u)\n", disconnect->domid, disconnect->blkif_handle); + disconnect->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND; + return 1; /* Caller will send response error message. 
*/ + } + + if ( blkif->status == CONNECTED ) + { + blkif->status = DISCONNECTING; + blkif->disconnect_rspid = rsp_id; + wmb(); /* Let other CPUs see the status change. */ + free_irq(blkif->irq, NULL); + blkif_deschedule(blkif); + blkif_put(blkif); + } + + return 0; /* Caller should not send response message. */ +} + +void __init blkif_interface_init(void) +{ + blkif_cachep = kmem_cache_create("blkif_cache", sizeof(blkif_t), + 0, 0, NULL, NULL); + memset(blkif_hash, 0, sizeof(blkif_hash)); +} diff --git a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/main.c b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/main.c index 1e6190c3e6..2582287360 100644 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/main.c +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/main.c @@ -24,6 +24,18 @@ #define MAX_PENDING_REQS 64 #define BATCH_PER_DOMAIN 16 +static struct vm_struct *mmap_vma; +#define MMAP_PAGES_PER_SEGMENT \ + ((BLKIF_MAX_SEGMENTS_PER_REQUEST >> (PAGE_SHIFT-9)) + 1) +#define MMAP_PAGES_PER_REQUEST \ + (2 * BLKIF_MAX_SEGMENTS_PER_REQUEST * MMAP_PAGES_PER_SEGMENT) +#define MMAP_PAGES \ + (MAX_PENDING_REQS * MMAP_PAGES_PER_REQUEST) +#define MMAP_VADDR(_req,_seg) \ + ((unsigned long)mmap_vma->addr + \ + ((_req) * MMAP_PAGES_PER_REQUEST * PAGE_SIZE) + \ + ((_seg) * MMAP_PAGES_PER_SEGMENT * PAGE_SIZE)) + /* * Each outstanding request that we've passed to the lower device layers has a * 'pending_req' allocated to it. 
Each buffer_head that completes decrements @@ -46,22 +58,11 @@ static PEND_RING_IDX pending_prod, pending_cons; static kmem_cache_t *buffer_head_cachep; -static struct buffer_head *completed_bhs[NR_CPUS] __cacheline_aligned; - -static int lock_buffer(blkif_t *blkif, - unsigned long buffer, - unsigned short size, - int writeable_buffer); -static void unlock_buffer(unsigned long buffer, - unsigned short size, - int writeable_buffer); - -static void io_schedule(unsigned long unused); static int do_block_io_op(blkif_t *blkif, int max_to_do); -static void dispatch_rw_block_io(blkif_t *blkif, - blk_ring_req_entry_t *req); +static void dispatch_probe(blkif_t *blkif, blkif_request_t *req); +static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req); static void make_response(blkif_t *blkif, unsigned long id, - unsigned short op, unsigned long st); + unsigned short op, int st); /****************************************************************** @@ -95,7 +96,7 @@ static void add_to_blkdev_list_tail(blkif_t *blkif) unsigned long flags; if ( __on_blkdev_list(blkif) ) return; spin_lock_irqsave(&io_schedule_list_lock, flags); - if ( !__on_blkdev_list(blkif) ) + if ( !__on_blkdev_list(blkif) && (blkif->status == CONNECTED) ) { list_add_tail(&blkif->blkdev_list, &io_schedule_list); blkif_get(blkif); @@ -108,8 +109,6 @@ static void add_to_blkdev_list_tail(blkif_t *blkif) * SCHEDULER FUNCTIONS */ -static DECLARE_TASKLET(io_schedule_tasklet, io_schedule, 0); - static void io_schedule(unsigned long unused) { blkif_t *blkif; @@ -132,6 +131,8 @@ static void io_schedule(unsigned long unused) run_task_queue(&tq_disk); } +static DECLARE_TASKLET(io_schedule_tasklet, io_schedule, 0); + static void maybe_trigger_io_schedule(void) { /* @@ -155,28 +156,26 @@ static void maybe_trigger_io_schedule(void) static void end_block_io_op(struct buffer_head *bh, int uptodate) { pending_req_t *pending_req = bh->b_private; + unsigned long flags; /* An error fails the entire request. 
*/ if ( !uptodate ) { DPRINTK("Buffer not up-to-date at end of operation\n"); - pending_req->status = 2; + pending_req->status = BLKIF_RSP_ERROR; } - unlock_buffer(virt_to_phys(bh->b_data), - bh->b_size, - (pending_req->operation==READ)); - if ( atomic_dec_and_test(&pending_req->pendcnt) ) { + int pending_idx = pending_req - pending_reqs; + vmfree_area_pages(MMAP_VADDR(pending_idx, 0), + MMAP_PAGES_PER_REQUEST * PAGE_SIZE); make_response(pending_req->blkif, pending_req->id, pending_req->operation, pending_req->status); blkif_put(pending_req->blkif); - spin_lock(&pend_prod_lock); - pending_ring[MASK_PEND_IDX(pending_prod)] = - pending_req - pending_reqs; - pending_prod++; - spin_unlock(&pend_prod_lock); + spin_lock_irqsave(&pend_prod_lock, flags); + pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx; + spin_unlock_irqrestore(&pend_prod_lock, flags); maybe_trigger_io_schedule(); } } @@ -200,45 +199,10 @@ void blkif_be_int(int irq, void *dev_id, struct pt_regs *regs) * DOWNWARD CALLS -- These interface with the block-device layer proper. 
*/ -static int lock_buffer(blkif_t *blkif, - unsigned long buffer, - unsigned short size, - int writeable_buffer) -{ - unsigned long pfn; - - for ( pfn = buffer >> PAGE_SHIFT; - pfn < ((buffer + size + PAGE_SIZE - 1) >> PAGE_SHIFT); - pfn++ ) - { - } - - return 1; - - fail: - while ( pfn-- > (buffer >> PAGE_SHIFT) ) - { - } - return 0; -} - -static void unlock_buffer(unsigned long buffer, - unsigned short size, - int writeable_buffer) -{ - unsigned long pfn; - - for ( pfn = buffer >> PAGE_SHIFT; - pfn < ((buffer + size + PAGE_SIZE - 1) >> PAGE_SHIFT); - pfn++ ) - { - } -} - static int do_block_io_op(blkif_t *blkif, int max_to_do) { - blk_ring_t *blk_ring = blkif->blk_ring_base; - blk_ring_req_entry_t *req; + blkif_ring_t *blk_ring = blkif->blk_ring_base; + blkif_request_t *req; BLK_RING_IDX i; int more_to_do = 0; @@ -262,11 +226,15 @@ static int do_block_io_op(blkif_t *blkif, int max_to_do) dispatch_rw_block_io(blkif, req); break; + case BLKIF_OP_PROBE: + dispatch_probe(blkif, req); + break; + default: DPRINTK("error: unknown block io operation [%d]\n", blk_ring->ring[i].req.operation); make_response(blkif, blk_ring->ring[i].req.id, - blk_ring->ring[i].req.operation, 1); + blk_ring->ring[i].req.operation, BLKIF_RSP_ERROR); break; } } @@ -275,24 +243,64 @@ static int do_block_io_op(blkif_t *blkif, int max_to_do) return more_to_do; } -static void dispatch_rw_block_io(blkif_t *blkif, - blk_ring_req_entry_t *req) +static void dispatch_probe(blkif_t *blkif, blkif_request_t *req) +{ + int i, rc, pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)]; + pgprot_t prot; + + /* Check that number of segments is sane. 
*/ + if ( unlikely(req->nr_segments == 0) || + unlikely(req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST) ) + { + DPRINTK("Bad number of segments in request (%d)\n", req->nr_segments); + goto bad_descriptor; + } + + prot = __pgprot(_PAGE_PRESENT|_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW); + for ( i = 0; i < req->nr_segments; i++ ) + { + if ( (req->buffer_and_sects[i] & ~PAGE_MASK) != (PAGE_SIZE / 512) ) + goto bad_descriptor; + rc = direct_remap_area_pages(&init_mm, + MMAP_VADDR(pending_idx, i), + req->buffer_and_sects[i] & PAGE_MASK, + PAGE_SIZE, prot, blkif->domid); + if ( rc != 0 ) + goto bad_descriptor; + } + + rc = vbd_probe(blkif, (vdisk_t *)MMAP_VADDR(pending_idx, 0), + (req->nr_segments * PAGE_SIZE) / sizeof(vdisk_t)); + + vmfree_area_pages(MMAP_VADDR(pending_idx, 0), + MMAP_PAGES_PER_REQUEST * PAGE_SIZE); + make_response(blkif, req->id, req->operation, rc); + return; + + bad_descriptor: + vmfree_area_pages(MMAP_VADDR(pending_idx, 0), + MMAP_PAGES_PER_REQUEST * PAGE_SIZE); + make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR); +} + +static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req) { extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]); struct buffer_head *bh; - int operation = (req->operation == XEN_BLOCK_WRITE) ? WRITE : READ; + int operation = (req->operation == BLKIF_OP_WRITE) ? WRITE : READ; unsigned short nr_sects; unsigned long buffer; - int i, tot_sects; + int i, tot_sects, pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)]; pending_req_t *pending_req; + pgprot_t prot; /* We map virtual scatter/gather segments to physical segments. */ int new_segs, nr_psegs = 0; - phys_seg_t phys_seg[MAX_BLK_SEGS * 2]; + phys_seg_t phys_seg[BLKIF_MAX_SEGMENTS_PER_REQUEST * 2]; /* Check that number of segments is sane. 
*/ if ( unlikely(req->nr_segments == 0) || - unlikely(req->nr_segments > MAX_BLK_SEGS) ) + unlikely(req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST) ) { DPRINTK("Bad number of segments in request (%d)\n", req->nr_segments); goto bad_descriptor; @@ -310,8 +318,11 @@ static void dispatch_rw_block_io(blkif_t *blkif, nr_sects = req->buffer_and_sects[i] & 0x1FF; if ( unlikely(nr_sects == 0) ) + continue; + + if ( unlikely(nr_sects > BLKIF_MAX_SECTORS_PER_SEGMENT) ) { - DPRINTK("zero-sized data request\n"); + DPRINTK("Too many sectors in segment\n"); goto bad_descriptor; } @@ -333,29 +344,41 @@ static void dispatch_rw_block_io(blkif_t *blkif, } nr_psegs += new_segs; - ASSERT(nr_psegs <= MAX_BLK_SEGS*2); + ASSERT(nr_psegs <= BLKIF_MAX_SEGMENTS_PER_REQUEST*2); } + /* Nonsensical zero-sized request? */ + if ( unlikely(nr_psegs == 0) ) + goto bad_descriptor; + + if ( operation == READ ) + prot = __pgprot(_PAGE_PRESENT|_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW); + else + prot = __pgprot(_PAGE_PRESENT|_PAGE_DIRTY|_PAGE_ACCESSED); + for ( i = 0; i < nr_psegs; i++ ) { - if ( unlikely(!lock_buffer(blkif, phys_seg[i].buffer, - phys_seg[i].nr_sects << 9, - operation==READ)) ) + unsigned long sz = ((phys_seg[i].buffer & ~PAGE_MASK) + + (phys_seg[i].nr_sects << 9) + + (PAGE_SIZE - 1)) & PAGE_MASK; + int rc = direct_remap_area_pages(&init_mm, + MMAP_VADDR(pending_idx, i), + phys_seg[i].buffer & PAGE_MASK, + sz, prot, blkif->domid); + if ( rc != 0 ) { DPRINTK("invalid buffer\n"); - while ( i-- > 0 ) - unlock_buffer(phys_seg[i].buffer, - phys_seg[i].nr_sects << 9, - operation==READ); + vmfree_area_pages(MMAP_VADDR(pending_idx, 0), + MMAP_PAGES_PER_REQUEST * PAGE_SIZE); goto bad_descriptor; } } - pending_req = &pending_reqs[pending_ring[MASK_PEND_IDX(pending_cons++)]]; + pending_req = &pending_reqs[pending_idx]; pending_req->blkif = blkif; pending_req->id = req->id; pending_req->operation = operation; - pending_req->status = 0; + pending_req->status = BLKIF_RSP_OKAY; 
atomic_set(&pending_req->pendcnt, nr_psegs); blkif_get(blkif); @@ -363,38 +386,37 @@ static void dispatch_rw_block_io(blkif_t *blkif, /* Now we pass each segment down to the real blkdev layer. */ for ( i = 0; i < nr_psegs; i++ ) { - bh = kmem_cache_alloc(buffer_head_cachep, GFP_KERNEL); + bh = kmem_cache_alloc(buffer_head_cachep, GFP_ATOMIC); if ( unlikely(bh == NULL) ) panic("bh is null\n"); memset(bh, 0, sizeof (struct buffer_head)); - + + init_waitqueue_head(&bh->b_wait); bh->b_size = phys_seg[i].nr_sects << 9; bh->b_dev = phys_seg[i].dev; + bh->b_rdev = phys_seg[i].dev; bh->b_rsector = (unsigned long)phys_seg[i].sector_number; - - /* SMH: we store a 'pseudo-virtual' bogus address in b_data since - later code will undo this transformation (i.e. +-PAGE_OFFSET). */ - bh->b_data = phys_to_virt(phys_seg[i].buffer); - - /* SMH: bh_phys() uses the below field as a 'cheap' virt_to_phys */ - bh->b_page = &mem_map[phys_seg[i].buffer>>PAGE_SHIFT]; + bh->b_data = (char *)MMAP_VADDR(pending_idx, i) + + (phys_seg[i].buffer & ~PAGE_MASK); bh->b_end_io = end_block_io_op; bh->b_private = pending_req; - bh->b_state = (1 << BH_Mapped) | (1 << BH_Lock); + bh->b_state = (1 << BH_Mapped) | (1 << BH_Lock) | + (1 << BH_Req) | (1 << BH_Launder); if ( operation == WRITE ) bh->b_state |= (1 << BH_JBD) | (1 << BH_Req) | (1 << BH_Uptodate); atomic_set(&bh->b_count, 1); /* Dispatch a single request. We'll flush it to disc later. */ - submit_bh(operation, bh); + generic_make_request(operation, bh); } + pending_cons++; return; bad_descriptor: - make_response(blkif, req->id, req->operation, 1); + make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR); } @@ -405,12 +427,13 @@ static void dispatch_rw_block_io(blkif_t *blkif, static void make_response(blkif_t *blkif, unsigned long id, - unsigned short op, unsigned long st) + unsigned short op, int st) { - blk_ring_resp_entry_t *resp; + blkif_response_t *resp; + unsigned long flags; /* Place on the response ring for the relevant domain. 
*/ - spin_lock(&blkif->blk_ring_lock); + spin_lock_irqsave(&blkif->blk_ring_lock, flags); resp = &blkif->blk_ring_base-> ring[MASK_BLK_IDX(blkif->blk_resp_prod)].resp; resp->id = id; @@ -418,64 +441,28 @@ static void make_response(blkif_t *blkif, unsigned long id, resp->status = st; wmb(); blkif->blk_ring_base->resp_prod = ++blkif->blk_resp_prod; - spin_unlock(&blkif->blk_ring_lock); + spin_unlock_irqrestore(&blkif->blk_ring_lock, flags); /* Kick the relevant domain. */ notify_via_evtchn(blkif->evtchn); } -static void blkif_debug_int(int irq, void *unused, struct pt_regs *regs) +void blkif_deschedule(blkif_t *blkif) { -#if 0 - unsigned long flags; - struct task_struct *p; - blk_ring_t *blk_ring; - int i; - - printk("Dumping block queue stats: nr_pending = %d" - " (prod=0x%08x,cons=0x%08x)\n", - NR_PENDING_REQS, pending_prod, pending_cons); - - read_lock_irqsave(&tasklist_lock, flags); - for_each_domain ( p ) - { - printk("Domain: %llu\n", blkif->domain); - blk_ring = blkif->blk_ring_base; - printk(" req_prod:0x%08x, req_cons:0x%08x resp_prod:0x%08x/" - "0x%08x on_list=%d\n", - blk_ring->req_prod, blkif->blk_req_cons, - blk_ring->resp_prod, blkif->blk_resp_prod, - __on_blkdev_list(p)); - } - read_unlock_irqrestore(&tasklist_lock, flags); - - for ( i = 0; i < MAX_PENDING_REQS; i++ ) - { - printk("Pend%d: dom=%p, id=%08lx, cnt=%d, op=%d, status=%d\n", - i, pending_reqs[i].domain, pending_reqs[i].id, - atomic_read(&pending_reqs[i].pendcnt), - pending_reqs[i].operation, pending_reqs[i].status); - } -#endif + remove_from_blkdev_list(blkif); } -void unlink_blkdev_info(blkif_t *blkif) +static int __init init_module(void) { - unsigned long flags; + int i; - spin_lock_irqsave(&io_schedule_list_lock, flags); - if ( __on_blkdev_list(blkif) ) + blkif_interface_init(); + + if ( (mmap_vma = get_vm_area(MMAP_PAGES * PAGE_SIZE, VM_IOREMAP)) == NULL ) { - list_del(&blkif->blkdev_list); - blkif->blkdev_list.next = (void *)0xdeadbeef; - blkif_put(blkif); + printk(KERN_WARNING "Could 
not allocate VMA for blkif backend.\n"); + return -ENOMEM; } - spin_unlock_irqrestore(&io_schedule_list_lock, flags); -} - -static int __init init_module(void) -{ - int i; pending_cons = 0; pending_prod = MAX_PENDING_REQS; @@ -483,20 +470,15 @@ static int __init init_module(void) for ( i = 0; i < MAX_PENDING_REQS; i++ ) pending_ring[i] = i; - for ( i = 0; i < NR_CPUS; i++ ) - completed_bhs[i] = NULL; - spin_lock_init(&io_schedule_list_lock); INIT_LIST_HEAD(&io_schedule_list); - if ( request_irq(bind_virq_to_irq(VIRQ_DEBUG), blkif_debug_int, - SA_SHIRQ, "blkif-backend-dbg", &blkif_debug_int) != 0 ) - printk(KERN_WARNING "Non-fatal error -- no debug interrupt\n"); - buffer_head_cachep = kmem_cache_create( "buffer_head_cache", sizeof(struct buffer_head), 0, SLAB_HWCACHE_ALIGN, NULL, NULL); + blkif_ctrlif_init(); + return 0; } diff --git a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/vbd.c b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/vbd.c index bd6c40125c..19b0b3015d 100644 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/vbd.c +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/vbd.c @@ -8,7 +8,7 @@ #include "common.h" -void vbd_create(blkif_vbd_create_t *create) +void vbd_create(blkif_be_vbd_create_t *create) { vbd_t *vbd; rb_node_t **rb_p, *rb_parent = NULL; @@ -18,9 +18,9 @@ void vbd_create(blkif_vbd_create_t *create) blkif = blkif_find_by_handle(create->domid, create->blkif_handle); if ( unlikely(blkif == NULL) ) { - DPRINTK("vbd_create attempted for non-existent blkif (%llu,&u)\n", + DPRINTK("vbd_create attempted for non-existent blkif (%llu,%u)\n", create->domid, create->blkif_handle); - create->status = BLKIF_STATUS_INTERFACE_NOT_FOUND; + create->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND; return; } @@ -42,49 +42,50 @@ void vbd_create(blkif_vbd_create_t *create) else { DPRINTK("vbd_create attempted for already existing vbd\n"); - create->status = BLKIF_STATUS_VBD_EXISTS; + create->status = 
BLKIF_BE_STATUS_VBD_EXISTS; goto out; } } - if ( unlikely((vbd = kmalloc(sizeof(vbd_t), GFP_KERNEL)) == NULL) ) + if ( unlikely((vbd = kmalloc(sizeof(vbd_t), GFP_ATOMIC)) == NULL) ) { DPRINTK("vbd_create: out of memory\n"); - create->status = BLKIF_STATUS_OUT_OF_MEMORY; + create->status = BLKIF_BE_STATUS_OUT_OF_MEMORY; goto out; } - vbd->vdevice = vdevice; - vbd->mode = create->mode; - vbd->type = VDISK_TYPE_DISK | VDISK_FLAG_VIRT; - vbd->extents = NULL; + vbd->vdevice = vdevice; + vbd->readonly = create->readonly; + vbd->type = VDISK_TYPE_DISK | VDISK_FLAG_VIRT; + vbd->extents = NULL; rb_link_node(&vbd->rb, rb_parent, rb_p); rb_insert_color(&vbd->rb, &blkif->vbd_rb); - create->status = BLKIF_STATUS_OKAY; + DPRINTK("Successful creation of vdev=%04x (dom=%llu)\n", + vdevice, create->domid); + create->status = BLKIF_BE_STATUS_OKAY; out: spin_unlock(&blkif->vbd_lock); - blkif_put(blkif); } /* Grow a VBD by appending a new extent. Fails if the VBD doesn't exist. */ -void vbd_grow(blkif_vbd_grow_t *grow) +void vbd_grow(blkif_be_vbd_grow_t *grow) { - blkif_t *blkif; - xen_extent_le_t **px, *x; - vbd_t *vbd = NULL; - rb_node_t *rb; - blkif_vdev_t vdevice = grow->vdevice; + blkif_t *blkif; + blkif_extent_le_t **px, *x; + vbd_t *vbd = NULL; + rb_node_t *rb; + blkif_vdev_t vdevice = grow->vdevice; blkif = blkif_find_by_handle(grow->domid, grow->blkif_handle); if ( unlikely(blkif == NULL) ) { - DPRINTK("vbd_grow attempted for non-existent blkif (%llu,&u)\n", + DPRINTK("vbd_grow attempted for non-existent blkif (%llu,%u)\n", grow->domid, grow->blkif_handle); - grow->status = BLKIF_STATUS_INTERFACE_NOT_FOUND; + grow->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND; return; } @@ -105,49 +106,51 @@ void vbd_grow(blkif_vbd_grow_t *grow) if ( unlikely(vbd == NULL) || unlikely(vbd->vdevice != vdevice) ) { DPRINTK("vbd_grow: attempted to append extent to non-existent VBD.\n"); - grow->status = BLKIF_STATUS_VBD_NOT_FOUND; + grow->status = BLKIF_BE_STATUS_VBD_NOT_FOUND; goto out; } - if ( 
unlikely((x = kmalloc(sizeof(xen_extent_le_t), GFP_KERNEL)) == NULL) ) + if ( unlikely((x = kmalloc(sizeof(blkif_extent_le_t), + GFP_ATOMIC)) == NULL) ) { DPRINTK("vbd_grow: out of memory\n"); - grow->status = BLKIF_STATUS_OUT_OF_MEMORY; + grow->status = BLKIF_BE_STATUS_OUT_OF_MEMORY; goto out; } x->extent.device = grow->extent.device; x->extent.sector_start = grow->extent.sector_start; x->extent.sector_length = grow->extent.sector_length; - x->next = (xen_extent_le_t *)NULL; + x->next = (blkif_extent_le_t *)NULL; for ( px = &vbd->extents; *px != NULL; px = &(*px)->next ) continue; *px = x; - grow->status = BLKIF_STATUS_OKAY; + DPRINTK("Successful grow of vdev=%04x (dom=%llu)\n", + vdevice, grow->domid); + grow->status = BLKIF_BE_STATUS_OKAY; out: spin_unlock(&blkif->vbd_lock); - blkif_put(blkif); } -void vbd_shrink(blkif_vbd_shrink_t *shrink) +void vbd_shrink(blkif_be_vbd_shrink_t *shrink) { - blkif_t *blkif; - xen_extent_le_t **px, *x; - vbd_t *vbd = NULL; - rb_node_t *rb; - blkif_vdev_t vdevice = shrink->vdevice; + blkif_t *blkif; + blkif_extent_le_t **px, *x; + vbd_t *vbd = NULL; + rb_node_t *rb; + blkif_vdev_t vdevice = shrink->vdevice; blkif = blkif_find_by_handle(shrink->domid, shrink->blkif_handle); if ( unlikely(blkif == NULL) ) { - DPRINTK("vbd_shrink attempted for non-existent blkif (%llu,&u)\n", + DPRINTK("vbd_shrink attempted for non-existent blkif (%llu,%u)\n", shrink->domid, shrink->blkif_handle); - shrink->status = BLKIF_STATUS_INTERFACE_NOT_FOUND; + shrink->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND; return; } @@ -167,13 +170,13 @@ void vbd_shrink(blkif_vbd_shrink_t *shrink) if ( unlikely(vbd == NULL) || unlikely(vbd->vdevice != vdevice) ) { - shrink->status = BLKIF_STATUS_VBD_NOT_FOUND; + shrink->status = BLKIF_BE_STATUS_VBD_NOT_FOUND; goto out; } if ( unlikely(vbd->extents == NULL) ) { - shrink->status = BLKIF_STATUS_EXTENT_NOT_FOUND; + shrink->status = BLKIF_BE_STATUS_EXTENT_NOT_FOUND; goto out; } @@ -185,28 +188,27 @@ void 
vbd_shrink(blkif_vbd_shrink_t *shrink) *px = x->next; kfree(x); - shrink->status = BLKIF_STATUS_OKAY; + shrink->status = BLKIF_BE_STATUS_OKAY; out: spin_unlock(&blkif->vbd_lock); - blkif_put(blkif); } -void vbd_destroy(blkif_vbd_destroy_t *destroy) +void vbd_destroy(blkif_be_vbd_destroy_t *destroy) { - blkif_t *blkif; - vbd_t *vbd; - rb_node_t *rb; - xen_extent_le_t *x, *t; - blkif_vdev_t vdevice = destroy->vdevice; + blkif_t *blkif; + vbd_t *vbd; + rb_node_t *rb; + blkif_extent_le_t *x, *t; + blkif_vdev_t vdevice = destroy->vdevice; blkif = blkif_find_by_handle(destroy->domid, destroy->blkif_handle); if ( unlikely(blkif == NULL) ) { - DPRINTK("vbd_destroy attempted for non-existent blkif (%llu,&u)\n", + DPRINTK("vbd_destroy attempted for non-existent blkif (%llu,%u)\n", destroy->domid, destroy->blkif_handle); - destroy->status = BLKIF_STATUS_INTERFACE_NOT_FOUND; + destroy->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND; return; } @@ -224,7 +226,7 @@ void vbd_destroy(blkif_vbd_destroy_t *destroy) goto found; } - destroy->status = BLKIF_STATUS_VBD_NOT_FOUND; + destroy->status = BLKIF_BE_STATUS_VBD_NOT_FOUND; goto out; found: @@ -241,7 +243,6 @@ void vbd_destroy(blkif_vbd_destroy_t *destroy) out: spin_unlock(&blkif->vbd_lock); - blkif_put(blkif); } @@ -249,7 +250,7 @@ void destroy_all_vbds(blkif_t *blkif) { vbd_t *vbd; rb_node_t *rb; - xen_extent_le_t *x, *t; + blkif_extent_le_t *x, *t; spin_lock(&blkif->vbd_lock); @@ -273,51 +274,30 @@ void destroy_all_vbds(blkif_t *blkif) } -static int vbd_probe_single(xen_disk_info_t *xdi, - vbd_t *vbd, - struct task_struct *p) +static int vbd_probe_single(blkif_t *blkif, vdisk_t *vbd_info, vbd_t *vbd) { - xen_extent_le_t *x; - xen_disk_t cur_disk; + blkif_extent_le_t *x; - if ( xdi->count == xdi->max ) - { - DPRINTK("vbd_probe_devices: out of space for probe.\n"); - return -ENOMEM; - } - - cur_disk.device = vbd->vdevice; - cur_disk.info = vbd->type; - if ( !VBD_CAN_WRITE(vbd) ) - cur_disk.info |= XD_FLAG_RO; - cur_disk.capacity = 
0ULL; + vbd_info->device = vbd->vdevice; + vbd_info->info = vbd->type; + if ( vbd->readonly ) + vbd_info->info |= VDISK_FLAG_RO; + vbd_info->capacity = 0ULL; for ( x = vbd->extents; x != NULL; x = x->next ) - cur_disk.capacity += x->extent.nr_sectors; - cur_disk.domain = p->domain; + vbd_info->capacity += x->extent.sector_length; - /* Now copy into relevant part of user-space buffer */ - if( copy_to_user(&xdi->disks[xdi->count], - &cur_disk, - sizeof(xen_disk_t)) ) - { - DPRINTK("vbd_probe_devices: copy_to_user failed\n"); - return -EFAULT; - } - - xdi->count++; - return 0; } -static int vbd_probe_devices(xen_disk_info_t *xdi, struct task_struct *p) +int vbd_probe(blkif_t *blkif, vdisk_t *vbd_info, int max_vbds) { - int rc = 0; + int rc = 0, nr_vbds = 0; rb_node_t *rb; - spin_lock(&p->vbd_lock); + spin_lock(&blkif->vbd_lock); - if ( (rb = p->vbd_rb.rb_node) == NULL ) + if ( (rb = blkif->vbd_rb.rb_node) == NULL ) goto out; new_subtree: @@ -328,7 +308,10 @@ static int vbd_probe_devices(xen_disk_info_t *xdi, struct task_struct *p) for ( ; ; ) { /* STEP 2. Dealt with left subtree. Now process current node. */ - if ( (rc = vbd_probe_single(xdi, rb_entry(rb, vbd_t, rb), p)) != 0 ) + if ( (rc = vbd_probe_single(blkif, &vbd_info[nr_vbds], + rb_entry(rb, vbd_t, rb))) != 0 ) + goto out; + if ( ++nr_vbds == max_vbds ) goto out; /* STEP 3. Process right subtree, if any. */ @@ -355,146 +338,22 @@ static int vbd_probe_devices(xen_disk_info_t *xdi, struct task_struct *p) } out: - spin_unlock(&p->vbd_lock); - return rc; -} - - -/* - * Return information about the VBDs available for a given domain, or for all - * domains; in the general case the 'domain' argument will be 0 which means - * "information about the caller"; otherwise the 'domain' argument will - * specify either a given domain, or all domains ("VBD_PROBE_ALL") -- both of - * these cases require the caller to be privileged. 
- */ -long vbd_probe(vbd_probe_t *probe) -{ - struct task_struct *p = NULL; - unsigned long flags; - long ret = 0; - - if ( probe->domain != 0 ) - { - /* We can only probe for ourselves (unless we're privileged). */ - if( (probe->domain != current->domain) && !IS_PRIV(current) ) - return -EPERM; - - if ( (probe->domain != VBD_PROBE_ALL) && - ((p = find_domain_by_id(probe->domain)) == NULL) ) - { - DPRINTK("vbd_probe attempted for non-existent domain %llu\n", - probe->domain); - return -EINVAL; - } - } - else - { - /* Default is to probe for ourselves. */ - p = current; - get_task_struct(p); /* to mirror final put_task_struct */ - } - - if ( probe->domain == VBD_PROBE_ALL ) - { - read_lock_irqsave(&tasklist_lock, flags); - for_each_domain ( p ) - { - if ( (ret = vbd_probe_devices(&probe->xdi, p)) != 0 ) - { - read_unlock_irqrestore(&tasklist_lock, flags); - goto out; - } - } - read_unlock_irqrestore(&tasklist_lock, flags); - } - else if ( (ret = vbd_probe_devices(&probe->xdi, p)) != 0 ) - goto out; - - out: - if ( ret != 0 ) - DPRINTK("vbd_probe: err %ld in probing virtual devices\n", ret); - if ( p != NULL ) - put_task_struct(p); - return ret; -} - - -long vbd_info(vbd_info_t *info) -{ - struct task_struct *p; - xen_extent_le_t *x; - xen_extent_t *extents; - vbd_t *vbd = NULL; - rb_node_t *rb; - long ret = 0; - - if ( (info->domain != current->domain) && !IS_PRIV(current) ) - return -EPERM; - - if ( (p = find_domain_by_id(info->domain)) == NULL ) - { - DPRINTK("vbd_info attempted for non-existent domain %llu\n", - info->domain); - return -EINVAL; - } - - spin_lock(&p->vbd_lock); - - rb = p->vbd_rb.rb_node; - while ( rb != NULL ) - { - vbd = rb_entry(rb, vbd_t, rb); - if ( info->vdevice < vbd->vdevice ) - rb = rb->rb_left; - else if ( info->vdevice > vbd->vdevice ) - rb = rb->rb_right; - else - break; - } - - if ( unlikely(vbd == NULL) || unlikely(vbd->vdevice != info->vdevice) ) - { - DPRINTK("vbd_info attempted on non-existent VBD.\n"); - ret = -EINVAL; - goto 
out; - } - - info->mode = vbd->mode; - info->nextents = 0; - - extents = info->extents; - for ( x = vbd->extents; x != NULL; x = x->next ) - { - if ( info->nextents == info->maxextents ) - break; - if ( copy_to_user(extents, &x->extent, sizeof(xen_extent_t)) ) - { - DPRINTK("vbd_info: copy_to_user failed\n"); - ret = -EFAULT; - goto out; - } - extents++; - info->nextents++; - } - - out: - spin_unlock(&p->vbd_lock); - put_task_struct(p); - return ret; + spin_unlock(&blkif->vbd_lock); + return (rc == 0) ? nr_vbds : rc; } -int vbd_translate(phys_seg_t *pseg, struct task_struct *p, int operation) +int vbd_translate(phys_seg_t *pseg, blkif_t *blkif, int operation) { - xen_extent_le_t *x; - vbd_t *vbd; - rb_node_t *rb; - xen_sector_t sec_off; - unsigned long nr_secs; + blkif_extent_le_t *x; + vbd_t *vbd; + rb_node_t *rb; + blkif_sector_t sec_off; + unsigned long nr_secs; - spin_lock(&p->vbd_lock); + spin_lock(&blkif->vbd_lock); - rb = p->vbd_rb.rb_node; + rb = blkif->vbd_rb.rb_node; while ( rb != NULL ) { vbd = rb_entry(rb, vbd_t, rb); @@ -507,42 +366,41 @@ int vbd_translate(phys_seg_t *pseg, struct task_struct *p, int operation) } DPRINTK("vbd_translate; domain %llu attempted to access " - "non-existent VBD.\n", p->domain); + "non-existent VBD.\n", blkif->domid); - spin_unlock(&p->vbd_lock); + spin_unlock(&blkif->vbd_lock); return -ENODEV; found: - if ( ((operation == READ) && !VBD_CAN_READ(vbd)) || - ((operation == WRITE) && !VBD_CAN_WRITE(vbd)) ) + if ( (operation == WRITE) && vbd->readonly ) { - spin_unlock(&p->vbd_lock); + spin_unlock(&blkif->vbd_lock); return -EACCES; } /* - * Now iterate through the list of xen_extents, working out which should + * Now iterate through the list of blkif_extents, working out which should * be used to perform the translation. 
*/ sec_off = pseg->sector_number; nr_secs = pseg->nr_sects; for ( x = vbd->extents; x != NULL; x = x->next ) { - if ( sec_off < x->extent.nr_sectors ) + if ( sec_off < x->extent.sector_length ) { pseg->dev = x->extent.device; - pseg->sector_number = x->extent.start_sector + sec_off; - if ( unlikely((sec_off + nr_secs) > x->extent.nr_sectors) ) + pseg->sector_number = x->extent.sector_start + sec_off; + if ( unlikely((sec_off + nr_secs) > x->extent.sector_length) ) goto overrun; spin_unlock(&p->vbd_lock); return 1; } - sec_off -= x->extent.nr_sectors; + sec_off -= x->extent.sector_length; } DPRINTK("vbd_translate: end of vbd.\n"); - spin_unlock(&p->vbd_lock); + spin_unlock(&blkif->vbd_lock); return -EACCES; /* @@ -554,7 +412,7 @@ int vbd_translate(phys_seg_t *pseg, struct task_struct *p, int operation) overrun: /* Adjust length of first chunk to run to end of first extent. */ - pseg[0].nr_sects = x->extent.nr_sectors - sec_off; + pseg[0].nr_sects = x->extent.sector_length - sec_off; /* Set second chunk buffer and length to start where first chunk ended. */ pseg[1].buffer = pseg[0].buffer + (pseg[0].nr_sects << 9); @@ -562,7 +420,7 @@ int vbd_translate(phys_seg_t *pseg, struct task_struct *p, int operation) /* Now move to the next extent. Check it exists and is long enough! */ if ( unlikely((x = x->next) == NULL) || - unlikely(x->extent.nr_sectors < pseg[1].nr_sects) ) + unlikely(x->extent.sector_length < pseg[1].nr_sects) ) { DPRINTK("vbd_translate: multiple overruns or end of vbd.\n"); spin_unlock(&p->vbd_lock); @@ -571,8 +429,8 @@ int vbd_translate(phys_seg_t *pseg, struct task_struct *p, int operation) /* Store the real device and start sector for the second chunk. 
*/ pseg[1].dev = x->extent.device; - pseg[1].sector_number = x->extent.start_sector; + pseg[1].sector_number = x->extent.sector_start; - spin_unlock(&p->vbd_lock); + spin_unlock(&blkif->vbd_lock); return 2; } diff --git a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/blkif.h b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/blkif.h index f6e8a4d5c8..5db2b48a51 100644 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/blkif.h +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/blkif.h @@ -16,24 +16,27 @@ #define BLKIF_OP_WRITE 1 #define BLKIF_OP_PROBE 2 -/* NB. Ring size must be small enough for sizeof(blk_ring_t) <= PAGE_SIZE. */ +/* NB. Ring size must be small enough for sizeof(blkif_ring_t) <= PAGE_SIZE. */ #define BLKIF_RING_SIZE 64 /* * Maximum scatter/gather segments per request. - * This is carefully chosen so that sizeof(blk_ring_t) <= PAGE_SIZE. + * This is carefully chosen so that sizeof(blkif_ring_t) <= PAGE_SIZE. * NB. This could be 12 if the ring indexes weren't stored in the same page. */ -#define BLKIF_REQUEST_MAX_SEGMENTS 11 +#define BLKIF_MAX_SEGMENTS_PER_REQUEST 11 + +#define BLKIF_MAX_SECTORS_PER_SEGMENT 16 typedef struct { unsigned char operation; /* BLKIF_OP_??? */ - unsigned char nr_segments; /* number of segments (<= MAX_BLK_SEGS) */ + unsigned char nr_segments; /* number of segments */ blkif_vdev_t device; /* only for read/write requests */ unsigned long id; /* private guest value, echoed in resp */ - xen_sector_t sector_number; /* start sector idx on disk (r/w only) */ - /* Least 9 bits is 'nr_sects'. High 23 bits is the address. */ - unsigned long buffer_and_sects[MAX_BLK_SEGS]; + blkif_sector_t sector_number; /* start sector idx on disk (r/w only) */ + /* Least 9 bits is 'nr_sects'. High 23 bits is the address. */ + /* We must have '0 <= nr_sects <= BLKIF_MAX_SECTORS_PER_SEGMENT'. 
*/ + unsigned long buffer_and_sects[BLKIF_MAX_SEGMENTS_PER_REQUEST]; } blkif_request_t; typedef struct { @@ -59,8 +62,8 @@ typedef unsigned int BLKIF_RING_IDX; #define MASK_BLKIF_IDX(_i) ((_i)&(BLKIF_RING_SIZE-1)) typedef struct { - BLKIF_RING_IDX req_prod; /* Request producer. Updated by guest OS. */ - BLKIF_RING_IDX resp_prod; /* Response producer. Updated by Xen. */ + BLKIF_RING_IDX req_prod; /* Request producer. Updated by front-end. */ + BLKIF_RING_IDX resp_prod; /* Response producer. Updated by back-end. */ union { blkif_request_t req; blkif_response_t resp; @@ -103,7 +106,7 @@ typedef struct { typedef struct { blkif_vdev_t device; /* Device number (opaque 16 bit value). */ unsigned short info; /* Device type and flags (VDISK_*). */ - xen_sector_t capacity; /* Size in terms of 512-byte sectors. */ + blkif_sector_t capacity; /* Size in terms of 512-byte sectors. */ } vdisk_t; #endif /* __SHARED_BLKIF_H__ */ diff --git a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/Makefile b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/Makefile index 35986ca54a..b0d27cf698 100644 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/Makefile +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/Makefile @@ -1,3 +1,3 @@ O_TARGET := drv.o -obj-y := block.o vbd.o +obj-y := main.o vbd.o include $(TOPDIR)/Rules.make diff --git a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/block.h b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/common.h index e41e03970e..2d4415bdef 100644 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/block.h +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/common.h @@ -1,11 +1,11 @@ /****************************************************************************** - * block.h + * arch/xen/drivers/blkif/frontend/common.h * * Shared definitions between all levels of XenoLinux Virtual block devices. 
*/ -#ifndef __XEN_DRIVERS_BLOCK_H__ -#define __XEN_DRIVERS_BLOCK_H__ +#ifndef __XEN_DRIVERS_COMMON_H__ +#define __XEN_DRIVERS_COMMON_H__ #include <linux/config.h> #include <linux/module.h> @@ -27,6 +27,8 @@ #include <asm/atomic.h> #include <asm/uaccess.h> +#include "../blkif.h" + #if 0 #define DPRINTK(_f, _a...) printk ( KERN_ALERT _f , ## _a ) #else @@ -52,14 +54,14 @@ typedef struct xl_disk { int usage; } xl_disk_t; -extern int xen_control_msg(int operration, char *buffer, int size); -extern int xen_block_open(struct inode *inode, struct file *filep); -extern int xen_block_release(struct inode *inode, struct file *filep); -extern int xen_block_ioctl(struct inode *inode, struct file *filep, +extern int blkif_open(struct inode *inode, struct file *filep); +extern int blkif_release(struct inode *inode, struct file *filep); +extern int blkif_ioctl(struct inode *inode, struct file *filep, unsigned command, unsigned long argument); -extern int xen_block_check(kdev_t dev); -extern int xen_block_revalidate(kdev_t dev); -extern void do_xlblk_request (request_queue_t *rq); +extern int blkif_check(kdev_t dev); +extern int blkif_revalidate(kdev_t dev); +extern void blkif_control_send(blkif_request_t *req, blkif_response_t *rsp); +extern void do_blkif_request (request_queue_t *rq); extern void xlvbd_update_vbds(void); @@ -79,4 +81,4 @@ static inline xl_disk_t *xldev_to_xldisk(kdev_t xldev) extern int xlvbd_init(void); extern void xlvbd_cleanup(void); -#endif /* __XEN_DRIVERS_BLOCK_H__ */ +#endif /* __XEN_DRIVERS_COMMON_H__ */ diff --git a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/block.c b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/main.c index d00dd98f7b..29cc01d087 100644 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/block.c +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/main.c @@ -1,5 +1,5 @@ /****************************************************************************** - * block.c + * 
arch/xen/drivers/blkif/frontend/main.c * * Xenolinux virtual block-device driver. * @@ -7,32 +7,35 @@ * Modifications by Mark A. Williamson are (c) Intel Research Cambridge */ -#include "block.h" +#include "common.h" #include <linux/blk.h> #include <linux/cdrom.h> #include <linux/tqueue.h> #include <linux/sched.h> -#include <scsi/scsi.h> - #include <linux/interrupt.h> +#include <scsi/scsi.h> +#include <asm/ctrl_if.h> typedef unsigned char byte; /* from linux/ide.h */ -#define STATE_ACTIVE 0 -#define STATE_SUSPENDED 1 -#define STATE_CLOSED 2 -static unsigned int state = STATE_SUSPENDED; +#define BLKIF_STATE_CLOSED 0 +#define BLKIF_STATE_DISCONNECTED 1 +#define BLKIF_STATE_CONNECTED 2 +static unsigned int blkif_state = BLKIF_STATE_CLOSED; +static unsigned int blkif_evtchn, blkif_irq; -/* Dynamically-mapped IRQs. */ -static int xlblk_response_irq, xlblk_update_irq; +static struct tq_struct blkif_statechange_tq; -static blk_ring_t *blk_ring; +static int blkif_control_rsp_valid; +static blkif_response_t blkif_control_rsp; + +static blkif_ring_t *blk_ring; static BLK_RING_IDX resp_cons; /* Response consumer for comms ring. */ static BLK_RING_IDX req_prod; /* Private request producer. */ /* We plug the I/O ring if the driver is suspended or if the ring is full. 
*/ #define RING_PLUGGED (((req_prod - resp_cons) == BLK_RING_SIZE) || \ - (state != STATE_ACTIVE)) + (blkif_state != BLKIF_STATE_CONNECTED)) /* @@ -49,39 +52,27 @@ static int sg_operation = -1; static unsigned long sg_next_sect; #define DISABLE_SCATTERGATHER() (sg_operation = -1) -static inline void signal_requests_to_xen(void) +static inline void flush_requests(void) { - block_io_op_t op; - DISABLE_SCATTERGATHER(); blk_ring->req_prod = req_prod; - - op.cmd = BLOCK_IO_OP_SIGNAL; - HYPERVISOR_block_io_op(&op); - return; + notify_via_evtchn(blkif_evtchn); } /* - * xlblk_update_int/update-vbds_task - handle VBD update events from Xen - * - * Schedule a task for keventd to run, which will update the VBDs and perform - * the corresponding updates to our view of VBD state, so the XenoLinux will - * respond to changes / additions / deletions to the set of VBDs automatically. + * blkif_update_int/update-vbds_task - handle VBD update events. + * Schedule a task for keventd to run, which will update the VBDs and perform + * the corresponding updates to our view of VBD state. 
*/ static struct tq_struct update_tq; static void update_vbds_task(void *unused) { xlvbd_update_vbds(); } -static void xlblk_update_int(int irq, void *dev_id, struct pt_regs *ptregs) -{ - update_tq.routine = update_vbds_task; - schedule_task(&update_tq); -} -int xen_block_open(struct inode *inode, struct file *filep) +int blkif_open(struct inode *inode, struct file *filep) { short xldev = inode->i_rdev; struct gendisk *gd = get_gendisk(xldev); @@ -122,7 +113,7 @@ int xen_block_open(struct inode *inode, struct file *filep) } -int xen_block_release(struct inode *inode, struct file *filep) +int blkif_release(struct inode *inode, struct file *filep) { xl_disk_t *disk = xldev_to_xldisk(inode->i_rdev); @@ -132,15 +123,17 @@ int xen_block_release(struct inode *inode, struct file *filep) */ if ( --disk->usage == 0 ) { +#if 0 update_tq.routine = update_vbds_task; schedule_task(&update_tq); +#endif } return 0; } -int xen_block_ioctl(struct inode *inode, struct file *filep, +int blkif_ioctl(struct inode *inode, struct file *filep, unsigned command, unsigned long argument) { kdev_t dev = inode->i_rdev; @@ -170,7 +163,7 @@ int xen_block_ioctl(struct inode *inode, struct file *filep, case BLKRRPART: /* re-read partition table */ DPRINTK_IOCTL(" BLKRRPART: %x\n", BLKRRPART); - return xen_block_revalidate(dev); + return blkif_revalidate(dev); case BLKSSZGET: return hardsect_size[MAJOR(dev)][MINOR(dev)]; @@ -218,11 +211,11 @@ int xen_block_ioctl(struct inode *inode, struct file *filep, return 0; case SCSI_IOCTL_GET_BUS_NUMBER: - DPRINTK("FIXME: SCSI_IOCTL_GET_BUS_NUMBER ioctl in Xen blkdev"); + DPRINTK("FIXME: SCSI_IOCTL_GET_BUS_NUMBER ioctl in XL blkif"); return -ENOSYS; default: - printk(KERN_ALERT "ioctl %08x not supported by Xen blkdev\n", command); + printk(KERN_ALERT "ioctl %08x not supported by XL blkif\n", command); return -ENOSYS; } @@ -230,13 +223,13 @@ int xen_block_ioctl(struct inode *inode, struct file *filep, } /* check media change: should probably do something here 
in some cases :-) */ -int xen_block_check(kdev_t dev) +int blkif_check(kdev_t dev) { - DPRINTK("xen_block_check\n"); + DPRINTK("blkif_check\n"); return 0; } -int xen_block_revalidate(kdev_t dev) +int blkif_revalidate(kdev_t dev) { struct block_device *bd; struct gendisk *gd; @@ -289,25 +282,25 @@ int xen_block_revalidate(kdev_t dev) /* - * hypervisor_request + * blkif_queue_request * * request block io * * id: for guest use only. - * operation: XEN_BLOCK_{READ,WRITE,PROBE,VBD*} + * operation: BLKIF_OP_{READ,WRITE,PROBE} * buffer: buffer to read/write into. this should be a * virtual address in the guest os. */ -static int hypervisor_request(unsigned long id, - int operation, - char * buffer, - unsigned long sector_number, - unsigned short nr_sectors, - kdev_t device) +static int blkif_queue_request(unsigned long id, + int operation, + char * buffer, + unsigned long sector_number, + unsigned short nr_sectors, + kdev_t device) { - unsigned long buffer_ma = phys_to_machine(virt_to_phys(buffer)); - struct gendisk *gd; - blk_ring_req_entry_t *req; + unsigned long buffer_ma = phys_to_machine(virt_to_phys(buffer)); + struct gendisk *gd; + blkif_request_t *req; struct buffer_head *bh; if ( unlikely(nr_sectors >= (1<<9)) ) @@ -315,26 +308,26 @@ static int hypervisor_request(unsigned long id, if ( unlikely((buffer_ma & ((1<<9)-1)) != 0) ) BUG(); - if ( unlikely(state == STATE_CLOSED) ) + if ( unlikely(blkif_state != BLKIF_STATE_CONNECTED) ) return 1; switch ( operation ) { - case XEN_BLOCK_READ: - case XEN_BLOCK_WRITE: + case BLKIF_OP_READ: + case BLKIF_OP_WRITE: gd = get_gendisk(device); /* * Update the sector_number we'll pass down as appropriate; note that * we could sanity check that resulting sector will be in this - * partition, but this will happen in xen anyhow. + * partition, but this will happen in driver backend anyhow. 
*/ sector_number += gd->part[MINOR(device)].start_sect; /* - * If this unit doesn't consist of virtual (i.e., Xen-specified) - * partitions then we clear the partn bits from the device number. + * If this unit doesn't consist of virtual partitions then we clear + * the partn bits from the device number. */ if ( !(gd->flags[MINOR(device)>>gd->minor_shift] & GENHD_FL_VIRT_PARTNS) ) @@ -375,7 +368,7 @@ static int hypervisor_request(unsigned long id, req = &blk_ring->ring[MASK_BLK_IDX(req_prod)].req; req->id = id; req->operation = operation; - req->sector_number = (xen_sector_t)sector_number; + req->sector_number = (blkif_sector_t)sector_number; req->device = device; req->nr_segments = 1; req->buffer_and_sects[0] = buffer_ma | nr_sectors; @@ -386,23 +379,23 @@ static int hypervisor_request(unsigned long id, /* - * do_xlblk_request + * do_blkif_request * read a block; request is in a request queue */ -void do_xlblk_request(request_queue_t *rq) +void do_blkif_request(request_queue_t *rq) { struct request *req; struct buffer_head *bh, *next_bh; int rw, nsect, full, queued = 0; - DPRINTK("xlblk.c::do_xlblk_request\n"); + DPRINTK("Entered do_blkif_request\n"); while ( !rq->plugged && !list_empty(&rq->queue_head)) { if ( (req = blkdev_entry_next_request(&rq->queue_head)) == NULL ) goto out; - DPRINTK("do_xlblk_request %p: cmd %i, sec %lx, (%li/%li) bh:%p\n", + DPRINTK("do_blkif_request %p: cmd %i, sec %lx, (%li/%li) bh:%p\n", req, req->cmd, req->sector, req->current_nr_sectors, req->nr_sectors, req->bh); @@ -420,9 +413,9 @@ void do_xlblk_request(request_queue_t *rq) next_bh = bh->b_reqnext; bh->b_reqnext = NULL; - full = hypervisor_request( + full = blkif_queue_request( (unsigned long)bh, - (rw == READ) ? XEN_BLOCK_READ : XEN_BLOCK_WRITE, + (rw == READ) ? 
BLKIF_OP_READ : BLKIF_OP_WRITE, bh->b_data, bh->b_rsector, bh->b_size>>9, bh->b_rdev); if ( full ) @@ -462,7 +455,8 @@ void do_xlblk_request(request_queue_t *rq) } out: - if ( queued != 0 ) signal_requests_to_xen(); + if ( queued != 0 ) + flush_requests(); } @@ -474,30 +468,30 @@ static void kick_pending_request_queues(void) { /* Attempt to drain the queue, but bail if the ring becomes full. */ while ( (nr_pending != 0) && !RING_PLUGGED ) - do_xlblk_request(pending_queues[--nr_pending]); + do_blkif_request(pending_queues[--nr_pending]); } } -static void xlblk_response_int(int irq, void *dev_id, struct pt_regs *ptregs) +static void blkif_int(int irq, void *dev_id, struct pt_regs *ptregs) { BLK_RING_IDX i; unsigned long flags; struct buffer_head *bh, *next_bh; - if ( unlikely(state == STATE_CLOSED) ) + if ( unlikely(blkif_state == BLKIF_STATE_CLOSED) ) return; spin_lock_irqsave(&io_request_lock, flags); for ( i = resp_cons; i != blk_ring->resp_prod; i++ ) { - blk_ring_resp_entry_t *bret = &blk_ring->ring[MASK_BLK_IDX(i)].resp; + blkif_response_t *bret = &blk_ring->ring[MASK_BLK_IDX(i)].resp; switch ( bret->operation ) { - case XEN_BLOCK_READ: - case XEN_BLOCK_WRITE: - if ( unlikely(bret->status != 0) ) + case BLKIF_OP_READ: + case BLKIF_OP_WRITE: + if ( unlikely(bret->status != BLKIF_RSP_OKAY) ) DPRINTK("Bad return from blkdev data request: %lx\n", bret->status); for ( bh = (struct buffer_head *)bret->id; @@ -506,10 +500,13 @@ static void xlblk_response_int(int irq, void *dev_id, struct pt_regs *ptregs) { next_bh = bh->b_reqnext; bh->b_reqnext = NULL; - bh->b_end_io(bh, !bret->status); + bh->b_end_io(bh, bret->status == BLKIF_RSP_OKAY); } break; - + case BLKIF_OP_PROBE: + memcpy(&blkif_control_rsp, bret, sizeof(*bret)); + blkif_control_rsp_valid = 1; + break; default: BUG(); } @@ -523,70 +520,190 @@ static void xlblk_response_int(int irq, void *dev_id, struct pt_regs *ptregs) } -static void reset_xlblk_interface(void) +void blkif_control_send(blkif_request_t *req, 
blkif_response_t *rsp) { - block_io_op_t op; + unsigned long flags; - nr_pending = 0; + retry: + while ( (req_prod - resp_cons) == BLK_RING_SIZE ) + { + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(1); + } - op.cmd = BLOCK_IO_OP_RESET; - if ( HYPERVISOR_block_io_op(&op) != 0 ) - printk(KERN_ALERT "Possible blkdev trouble: couldn't reset ring\n"); + spin_lock_irqsave(&io_request_lock, flags); + if ( (req_prod - resp_cons) == BLK_RING_SIZE ) + { + spin_unlock_irqrestore(&io_request_lock, flags); + goto retry; + } - op.cmd = BLOCK_IO_OP_RING_ADDRESS; - (void)HYPERVISOR_block_io_op(&op); + DISABLE_SCATTERGATHER(); + memcpy(&blk_ring->ring[MASK_BLK_IDX(req_prod)].req, req, sizeof(*req)); + req_prod++; + flush_requests(); - set_fixmap(FIX_BLKRING_BASE, op.u.ring_mfn << PAGE_SHIFT); - blk_ring = (blk_ring_t *)fix_to_virt(FIX_BLKRING_BASE); - blk_ring->req_prod = blk_ring->resp_prod = resp_cons = req_prod = 0; + spin_unlock_irqrestore(&io_request_lock, flags); - wmb(); - state = STATE_ACTIVE; + while ( !blkif_control_rsp_valid ) + { + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(1); + } + + memcpy(rsp, &blkif_control_rsp, sizeof(*rsp)); + blkif_control_rsp_valid = 0; } -int __init xlblk_init(void) +static void blkif_bringup_phase1(void *unused) { - int error; + ctrl_msg_t cmsg; + blkif_fe_interface_connect_t up; + + /* Move from CLOSED to DISCONNECTED state. */ + blk_ring = (blkif_ring_t *)__get_free_page(GFP_KERNEL); + blk_ring->req_prod = blk_ring->resp_prod = resp_cons = req_prod = 0; + blkif_state = BLKIF_STATE_DISCONNECTED; + + /* Construct an interface-CONNECT message for the domain controller. */ + cmsg.type = CMSG_BLKIF_FE; + cmsg.subtype = CMSG_BLKIF_FE_INTERFACE_CONNECT; + cmsg.length = sizeof(blkif_fe_interface_connect_t); + up.handle = 0; + up.shmem_frame = virt_to_machine(blk_ring) >> PAGE_SHIFT; + memcpy(cmsg.msg, &up, sizeof(up)); + + /* Tell the controller to bring up the interface. 
*/ + ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE); +} + +static void blkif_bringup_phase2(void *unused) +{ + blkif_irq = bind_evtchn_to_irq(blkif_evtchn); + (void)request_irq(blkif_irq, blkif_int, 0, "blkif", NULL); + + /* Probe for discs that are attached to the interface. */ + xlvbd_init(); - reset_xlblk_interface(); + blkif_state = BLKIF_STATE_CONNECTED; - xlblk_response_irq = bind_virq_to_irq(VIRQ_BLKDEV); - xlblk_update_irq = bind_virq_to_irq(VIRQ_VBD_UPD); + /* Kick pending requests. */ + spin_lock_irq(&io_request_lock); + kick_pending_request_queues(); + spin_unlock_irq(&io_request_lock); +} - error = request_irq(xlblk_response_irq, xlblk_response_int, - SA_SAMPLE_RANDOM, "blkdev", NULL); - if ( error ) +static void blkif_status_change(blkif_fe_interface_status_changed_t *status) +{ + if ( status->handle != 0 ) { - printk(KERN_ALERT "Could not allocate receive interrupt\n"); - goto fail; + printk(KERN_WARNING "Status change on unsupported blkif %d\n", + status->handle); + return; + } + + switch ( status->status ) + { + case BLKIF_INTERFACE_STATUS_DESTROYED: + printk(KERN_WARNING "Unexpected blkif-DESTROYED message in state %d\n", + blkif_state); + break; + + case BLKIF_INTERFACE_STATUS_DISCONNECTED: + if ( blkif_state != BLKIF_STATE_CLOSED ) + { + printk(KERN_WARNING "Unexpected blkif-DISCONNECTED message" + " in state %d\n", blkif_state); + break; + } + blkif_statechange_tq.routine = blkif_bringup_phase1; + schedule_task(&blkif_statechange_tq); + break; + + case BLKIF_INTERFACE_STATUS_CONNECTED: + if ( blkif_state == BLKIF_STATE_CLOSED ) + { + printk(KERN_WARNING "Unexpected blkif-CONNECTED message" + " in state %d\n", blkif_state); + break; + } + blkif_evtchn = status->evtchn; + blkif_statechange_tq.routine = blkif_bringup_phase2; + schedule_task(&blkif_statechange_tq); + break; + + default: + printk(KERN_WARNING "Status change to unknown value %d\n", + status->status); + break; } +} - error = request_irq(xlblk_update_irq, 
xlblk_update_int, - 0, "blkdev", NULL); - if ( error ) +static void blkif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id) +{ + switch ( msg->subtype ) { - printk(KERN_ALERT "Could not allocate block update interrupt\n"); - goto fail; + case CMSG_BLKIF_FE_INTERFACE_STATUS_CHANGED: + if ( msg->length != sizeof(blkif_fe_interface_status_changed_t) ) + goto parse_error; + blkif_status_change((blkif_fe_interface_status_changed_t *) + &msg->msg[0]); + break; +#if 0 + case CMSG_BLKIF_FE_VBD_STATUS_CHANGED: + update_tq.routine = update_vbds_task; + schedule_task(&update_tq); + break; +#endif + default: + goto parse_error; } - (void)xlvbd_init(); + ctrl_if_send_response(msg); + return; - return 0; + parse_error: + msg->length = 0; + ctrl_if_send_response(msg); +} - fail: - return error; + +int __init xlblk_init(void) +{ + ctrl_msg_t cmsg; + blkif_fe_driver_status_changed_t st; + + (void)ctrl_if_register_receiver(CMSG_BLKIF_FE, blkif_ctrlif_rx); + + /* Send a driver-UP notification to the domain controller. */ + cmsg.type = CMSG_BLKIF_FE; + cmsg.subtype = CMSG_BLKIF_FE_DRIVER_STATUS_CHANGED; + cmsg.length = sizeof(blkif_fe_driver_status_changed_t); + st.status = BLKIF_DRIVER_STATUS_UP; + memcpy(cmsg.msg, &st, sizeof(st)); + ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE); + + /* + * We should read 'nr_interfaces' from response message and wait + * for notifications before proceeding. For now we assume that we + * will be notified of exactly one interface. 
+ */ + while ( blkif_state != BLKIF_STATE_CONNECTED ) + { + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(1); + } + + return 0; } static void __exit xlblk_cleanup(void) { - xlvbd_cleanup(); - free_irq(xlblk_response_irq, NULL); - free_irq(xlblk_update_irq, NULL); - unbind_virq_from_irq(VIRQ_BLKDEV); - unbind_virq_from_irq(VIRQ_VBD_UPD); + /* XXX FIXME */ + BUG(); } @@ -598,28 +715,13 @@ module_exit(xlblk_cleanup); void blkdev_suspend(void) { - state = STATE_SUSPENDED; - wmb(); - - while ( resp_cons != blk_ring->req_prod ) - { - barrier(); - current->state = TASK_INTERRUPTIBLE; - schedule_timeout(1); - } - - wmb(); - state = STATE_CLOSED; - wmb(); - - clear_fixmap(FIX_BLKRING_BASE); + /* XXX FIXME */ + BUG(); } void blkdev_resume(void) { - reset_xlblk_interface(); - spin_lock_irq(&io_request_lock); - kick_pending_request_queues(); - spin_unlock_irq(&io_request_lock); + /* XXX FIXME */ + BUG(); } diff --git a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/vbd.c b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/vbd.c index e08b976c56..b26907192a 100644 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/vbd.c +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/vbd.c @@ -1,13 +1,13 @@ /****************************************************************************** - * vbd.c + * arch/xen/drivers/blkif/frontend/vbd.c * - * Xenolinux virtual block-device driver (xvd). + * Xenolinux virtual block-device driver. * * Copyright (c) 2003-2004, Keir Fraser & Steve Hand * Modifications by Mark A. Williamson are (c) Intel Research Cambridge */ -#include "block.h" +#include "common.h" #include <linux/blk.h> /* @@ -43,54 +43,59 @@ static int xlvbd_blksize_size[256]; static int xlvbd_hardsect_size[256]; static int xlvbd_max_sectors[256]; -/* Information from Xen about our VBDs. */ +/* Information about our VBDs. 
*/ #define MAX_VBDS 64 static int nr_vbds; -static xen_disk_t *vbd_info; +static vdisk_t *vbd_info; static struct block_device_operations xlvbd_block_fops = { - open: xen_block_open, - release: xen_block_release, - ioctl: xen_block_ioctl, - check_media_change: xen_block_check, - revalidate: xen_block_revalidate, + open: blkif_open, + release: blkif_release, + ioctl: blkif_ioctl, + check_media_change: blkif_check, + revalidate: blkif_revalidate, }; -static int xlvbd_get_vbd_info(xen_disk_t *disk_info) +static int xlvbd_get_vbd_info(vdisk_t *disk_info) { - int error; - block_io_op_t op; - - /* Probe for disk information. */ - memset(&op, 0, sizeof(op)); - op.cmd = BLOCK_IO_OP_VBD_PROBE; - op.u.probe_params.domain = 0; - op.u.probe_params.xdi.max = MAX_VBDS; - op.u.probe_params.xdi.disks = disk_info; - op.u.probe_params.xdi.count = 0; - - if ( (error = HYPERVISOR_block_io_op(&op)) != 0 ) + vdisk_t *buf = (vdisk_t *)__get_free_page(GFP_KERNEL); + blkif_request_t req; + blkif_response_t rsp; + int nr; + + memset(&req, 0, sizeof(req)); + req.operation = BLKIF_OP_PROBE; + req.nr_segments = 1; + req.buffer_and_sects[0] = virt_to_machine(buf) | (PAGE_SIZE/512); + + blkif_control_send(&req, &rsp); + + if ( rsp.status <= 0 ) { - printk(KERN_ALERT "Could not probe disks (%d)\n", error); + printk(KERN_ALERT "Could not probe disks (%d)\n", rsp.status); return -1; } - return op.u.probe_params.xdi.count; + if ( (nr = rsp.status) > MAX_VBDS ) + nr = MAX_VBDS; + memcpy(disk_info, buf, nr * sizeof(vdisk_t)); + + return nr; } /* * xlvbd_init_device - initialise a VBD device - * @disk: a xen_disk_t describing the VBD + * @disk: a vdisk_t describing the VBD * - * Takes a xen_disk_t * that describes a VBD the domain has access to. + * Takes a vdisk_t * that describes a VBD the domain has access to. * Performs appropriate initialisation and registration of the device. * * Care needs to be taken when making re-entrant calls to ensure that * corruption does not occur. 
Also, devices that are in use should not have * their details updated. This is the caller's responsibility. */ -static int xlvbd_init_device(xen_disk_t *xd) +static int xlvbd_init_device(vdisk_t *xd) { int device = xd->device; int major = MAJOR(device); @@ -181,11 +186,11 @@ static int xlvbd_init_device(xen_disk_t *xd) read_ahead[major] = 8; } - blk_init_queue(BLK_DEFAULT_QUEUE(major), do_xlblk_request); + blk_init_queue(BLK_DEFAULT_QUEUE(major), do_blkif_request); /* * Turn off barking 'headactive' mode. We dequeue buffer heads as - * soon as we pass them down to Xen. + * soon as we pass them to the back-end driver. */ blk_queue_headactive(BLK_DEFAULT_QUEUE(major), 0); @@ -431,12 +436,12 @@ static int xlvbd_remove_device(int device) void xlvbd_update_vbds(void) { int i, j, k, old_nr, new_nr; - xen_disk_t *old_info, *new_info, *merged_info; + vdisk_t *old_info, *new_info, *merged_info; old_info = vbd_info; old_nr = nr_vbds; - new_info = kmalloc(MAX_VBDS * sizeof(xen_disk_t), GFP_KERNEL); + new_info = kmalloc(MAX_VBDS * sizeof(vdisk_t), GFP_KERNEL); if ( unlikely(new_nr = xlvbd_get_vbd_info(new_info)) < 0 ) { kfree(new_info); @@ -448,7 +453,7 @@ void xlvbd_update_vbds(void) * old list and new list do not overlap at all, and we cannot yet destroy * VBDs in the old list because the usage counts are busy. */ - merged_info = kmalloc((old_nr + new_nr) * sizeof(xen_disk_t), GFP_KERNEL); + merged_info = kmalloc((old_nr + new_nr) * sizeof(vdisk_t), GFP_KERNEL); /* @i tracks old list; @j tracks new list; @k tracks merged list. 
*/ i = j = k = 0; @@ -458,13 +463,13 @@ void xlvbd_update_vbds(void) if ( old_info[i].device < new_info[j].device ) { if ( xlvbd_remove_device(old_info[i].device) != 0 ) - memcpy(&merged_info[k++], &old_info[i], sizeof(xen_disk_t)); + memcpy(&merged_info[k++], &old_info[i], sizeof(vdisk_t)); i++; } else if ( old_info[i].device > new_info[j].device ) { if ( xlvbd_init_device(&new_info[j]) == 0 ) - memcpy(&merged_info[k++], &new_info[j], sizeof(xen_disk_t)); + memcpy(&merged_info[k++], &new_info[j], sizeof(vdisk_t)); j++; } else @@ -472,9 +477,9 @@ void xlvbd_update_vbds(void) if ( ((old_info[i].capacity == new_info[j].capacity) && (old_info[i].info == new_info[j].info)) || (xlvbd_remove_device(old_info[i].device) != 0) ) - memcpy(&merged_info[k++], &old_info[i], sizeof(xen_disk_t)); + memcpy(&merged_info[k++], &old_info[i], sizeof(vdisk_t)); else if ( xlvbd_init_device(&new_info[j]) == 0 ) - memcpy(&merged_info[k++], &new_info[j], sizeof(xen_disk_t)); + memcpy(&merged_info[k++], &new_info[j], sizeof(vdisk_t)); i++; j++; } } @@ -482,13 +487,13 @@ void xlvbd_update_vbds(void) for ( ; i < old_nr; i++ ) { if ( xlvbd_remove_device(old_info[i].device) != 0 ) - memcpy(&merged_info[k++], &old_info[i], sizeof(xen_disk_t)); + memcpy(&merged_info[k++], &old_info[i], sizeof(vdisk_t)); } for ( ; j < new_nr; j++ ) { if ( xlvbd_init_device(&new_info[j]) == 0 ) - memcpy(&merged_info[k++], &new_info[j], sizeof(xen_disk_t)); + memcpy(&merged_info[k++], &new_info[j], sizeof(vdisk_t)); } vbd_info = merged_info; @@ -500,14 +505,14 @@ void xlvbd_update_vbds(void) /* - * Set up all the linux device goop for the virtual block devices (vbd's) that - * xen tells us about. Note that although from xen's pov VBDs are addressed - * simply an opaque 16-bit device number, the domain creation tools + * Set up all the linux device goop for the virtual block devices (vbd's) that + * we know about. Note that although from the backend driver's p.o.v. 
VBDs are + * addressed simply an opaque 16-bit device number, the domain creation tools * conventionally allocate these numbers to correspond to those used by 'real' * linux -- this is just for convenience as it means e.g. that the same - * /etc/fstab can be used when booting with or without xen. + * /etc/fstab can be used when booting with or without Xen. */ -int __init xlvbd_init(void) +int xlvbd_init(void) { int i; @@ -537,7 +542,7 @@ int __init xlvbd_init(void) xlvbd_max_sectors[i] = 128; } - vbd_info = kmalloc(MAX_VBDS * sizeof(xen_disk_t), GFP_KERNEL); + vbd_info = kmalloc(MAX_VBDS * sizeof(vdisk_t), GFP_KERNEL); nr_vbds = xlvbd_get_vbd_info(vbd_info); if ( nr_vbds < 0 ) @@ -554,8 +559,3 @@ int __init xlvbd_init(void) return 0; } - - -#ifdef MODULE -module_init(xlvbd_init); -#endif diff --git a/xenolinux-2.4.26-sparse/arch/xen/drivers/dom0/core.c b/xenolinux-2.4.26-sparse/arch/xen/drivers/dom0/core.c index 4e507081be..98eff63453 100644 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/dom0/core.c +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/dom0/core.c @@ -36,7 +36,7 @@ static struct proc_dir_entry *privcmd_intf; static int privcmd_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long data) { - int ret = 0; + int ret = -ENOSYS; switch ( cmd ) { @@ -108,6 +108,13 @@ static int privcmd_ioctl(struct inode *inode, struct file *file, } break; + case IOCTL_PRIVCMD_INITDOMAIN_EVTCHN: + { + extern int initdom_ctrlif_domcontroller_port; + ret = initdom_ctrlif_domcontroller_port; + } + break; + default: ret = -EINVAL; break; @@ -140,7 +147,7 @@ static int __init init_module(void) { privcmd_intf->owner = THIS_MODULE; privcmd_intf->nlink = 1; - privcmd_intf->proc_fops = &privcmd_file_ops; + privcmd_intf->proc_fops = &privcmd_file_ops; } return 0; diff --git a/xenolinux-2.4.26-sparse/arch/xen/kernel/ctrl_if.c b/xenolinux-2.4.26-sparse/arch/xen/kernel/ctrl_if.c index 7d59ad2e16..715f707eb0 100644 --- 
a/xenolinux-2.4.26-sparse/arch/xen/kernel/ctrl_if.c +++ b/xenolinux-2.4.26-sparse/arch/xen/kernel/ctrl_if.c @@ -17,6 +17,13 @@ #include <asm/ctrl_if.h> #include <asm/evtchn.h> +/* + * Only used by initial domain which must create its own control-interface + * event channel. This value is picked up by the user-space domain controller + * via an ioctl. + */ +int initdom_ctrlif_domcontroller_port = -1; + static int ctrl_if_evtchn; static int ctrl_if_irq; static spinlock_t ctrl_if_lock; @@ -276,9 +283,6 @@ void ctrl_if_unregister_receiver(u8 type, ctrl_msg_handler_t hnd) void ctrl_if_suspend(void) { - if ( start_info.flags & SIF_INITDOMAIN ) - return; - free_irq(ctrl_if_irq, NULL); unbind_evtchn_from_irq(ctrl_if_evtchn); } @@ -286,7 +290,21 @@ void ctrl_if_suspend(void) void ctrl_if_resume(void) { if ( start_info.flags & SIF_INITDOMAIN ) - return; + { + /* + * The initial domain must create its own domain-controller link. + * The controller is probably not running at this point, but will + * pick up its end of the event channel from + */ + evtchn_op_t op; + op.cmd = EVTCHNOP_bind_interdomain; + op.u.bind_interdomain.dom1 = DOMID_SELF; + op.u.bind_interdomain.dom2 = DOMID_SELF; + if ( HYPERVISOR_event_channel_op(&op) != 0 ) + BUG(); + start_info.domain_controller_evtchn = op.u.bind_interdomain.port1; + initdom_ctrlif_domcontroller_port = op.u.bind_interdomain.port2; + } ctrl_if_tx_resp_cons = 0; ctrl_if_rx_req_cons = 0; diff --git a/xenolinux-2.4.26-sparse/arch/xen/mm/ioremap.c b/xenolinux-2.4.26-sparse/arch/xen/mm/ioremap.c index 7b4d1ff0a9..773a1f83c3 100644 --- a/xenolinux-2.4.26-sparse/arch/xen/mm/ioremap.c +++ b/xenolinux-2.4.26-sparse/arch/xen/mm/ioremap.c @@ -5,7 +5,7 @@ * * (C) Copyright 1995 1996 Linus Torvalds * - * Modifications for Xenolinux (c) 2003 Keir Fraser + * Modifications for Xenolinux (c) 2003-2004 Keir Fraser */ #include <linux/slab.h> @@ -28,21 +28,26 @@ __direct_mk_pte((physpage) >> PAGE_SHIFT, pgprot) static inline int 
direct_remap_area_pte(pte_t *pte, - unsigned long address, - unsigned long size, - unsigned long machine_addr, - pgprot_t prot, - domid_t domid) + unsigned long address, + unsigned long size, + unsigned long machine_addr, + pgprot_t prot, + domid_t domid) { unsigned long end; +#define MAX_DIRECTMAP_MMU_QUEUE 64 + mmu_update_t u[MAX_DIRECTMAP_MMU_QUEUE], *v; - mmu_update_t *u, *v; - u = v = vmalloc(3*PAGE_SIZE); /* plenty */ - - if (!u) - return -ENOMEM; + address &= ~PMD_MASK; + end = address + size; + if (end > PMD_SIZE) + end = PMD_SIZE; + if (address >= end) + BUG(); + reset_buffer: /* If not I/O mapping then specify General-Purpose Subject Domain (GPS). */ + v = &u[0]; if ( domid != 0 ) { v[0].val = (unsigned long)(domid<<16) & ~0xFFFFUL; @@ -56,12 +61,6 @@ static inline int direct_remap_area_pte(pte_t *pte, v += 2; } - address &= ~PMD_MASK; - end = address + size; - if (end > PMD_SIZE) - end = PMD_SIZE; - if (address >= end) - BUG(); do { #if 0 /* thanks to new ioctl mmaping interface this is no longer a bug */ if (!pte_none(*pte)) { @@ -71,7 +70,12 @@ static inline int direct_remap_area_pte(pte_t *pte, #endif v->ptr = virt_to_machine(pte); v->val = (machine_addr & PAGE_MASK) | pgprot_val(prot) | _PAGE_IO; - v++; + if ( ++v == MAX_DIRECTMAP_MMU_QUEUE ) + { + if ( HYPERVISOR_mmu_update(u, MAX_DIRECTMAP_MMU_QUEUE) < 0 ) + return -EFAULT; + goto reset_buffer; + } address += PAGE_SIZE; machine_addr += PAGE_SIZE; pte++; @@ -84,7 +88,6 @@ static inline int direct_remap_area_pte(pte_t *pte, return -EINVAL; } - vfree(u); return 0; } @@ -96,8 +99,8 @@ static inline int direct_remap_area_pmd(struct mm_struct *mm, pgprot_t prot, domid_t domid) { + int error = 0; unsigned long end; - int rc; address &= ~PGDIR_MASK; end = address + size; @@ -111,14 +114,14 @@ static inline int direct_remap_area_pmd(struct mm_struct *mm, if (!pte) return -ENOMEM; - if ( rc = direct_remap_area_pte(pte, address, end - address, - address + machine_addr, prot, domid) ) - return rc; - + error = 
direct_remap_area_pte(pte, address, end - address, + address + machine_addr, prot, domid); + if ( error ) + break; address = (address + PMD_SIZE) & PMD_MASK; pmd++; } while (address && (address < end)); - return 0; + return error; } int direct_remap_area_pages(struct mm_struct *mm, diff --git a/xenolinux-2.4.26-sparse/include/asm-xen/ctrl_if.h b/xenolinux-2.4.26-sparse/include/asm-xen/ctrl_if.h index f1d2b77c2e..a02e2471ea 100644 --- a/xenolinux-2.4.26-sparse/include/asm-xen/ctrl_if.h +++ b/xenolinux-2.4.26-sparse/include/asm-xen/ctrl_if.h @@ -52,7 +52,7 @@ int ctrl_if_send_message_noblock( * function returns. * 2. If @hnd is NULL then no callback is executed. */ -int ctrl_if_send_message( +int ctrl_if_send_message_block( ctrl_msg_t *msg, ctrl_msg_handler_t hnd, unsigned long id, diff --git a/xenolinux-2.4.26-sparse/include/asm-xen/io.h b/xenolinux-2.4.26-sparse/include/asm-xen/io.h index 3d78e20950..f5243bb6a7 100644 --- a/xenolinux-2.4.26-sparse/include/asm-xen/io.h +++ b/xenolinux-2.4.26-sparse/include/asm-xen/io.h @@ -159,13 +159,47 @@ extern void iounmap(void *addr); extern void *bt_ioremap(unsigned long offset, unsigned long size); extern void bt_iounmap(void *addr, unsigned long size); +#ifdef CONFIG_XEN_PHYSDEV_ACCESS + +#ifdef CONFIG_HIGHMEM +#error "Highmem is not yet compatible with physical device access" +#endif + /* - * IO bus memory addresses are also 1:1 with the physical address + * The bus translation macros need special care if we are executing device + * accesses to/from other domains' memory. In these cases the virtual address + * is actually a temporary mapping in the 'vmalloc' space. The physical + * address will therefore be >max_low_pfn, and will not have a valid entry + * in the phys_to_mach mapping table. 
*/ +static inline unsigned long phys_to_bus(unsigned long phys) +{ + extern unsigned long max_pfn; + pgd_t *pgd; pmd_t *pmd; pte_t *pte; + void *addr; + unsigned long bus; + if ( (phys >> PAGE_SHIFT) < max_pfn ) + return phys_to_machine(phys); + addr = phys_to_virt(phys); + pgd = pgd_offset_k( (unsigned long)addr); + pmd = pmd_offset(pgd, (unsigned long)addr); + pte = pte_offset(pmd, (unsigned long)addr); + bus = (pte->pte_low & PAGE_MASK) | (phys & ~PAGE_MASK); + return bus; +} + +#define virt_to_bus(_x) phys_to_bus(virt_to_phys(_x)) +#define bus_to_virt(_x) phys_to_virt(machine_to_phys(_x)) +#define page_to_bus(_x) phys_to_bus(page_to_phys(_x)) + +#else + #define virt_to_bus(_x) phys_to_machine(virt_to_phys(_x)) #define bus_to_virt(_x) phys_to_virt(machine_to_phys(_x)) #define page_to_bus(_x) phys_to_machine(page_to_phys(_x)) +#endif /* CONFIG_XEN_PHYSDEV_ACCESS */ + /* * readX/writeX() are used to access memory mapped devices. On some * architectures the memory mapped IO stuff needs to be accessed diff --git a/xenolinux-2.4.26-sparse/include/asm-xen/pgtable-2level.h b/xenolinux-2.4.26-sparse/include/asm-xen/pgtable-2level.h index c780f644c0..162ba1fbed 100644 --- a/xenolinux-2.4.26-sparse/include/asm-xen/pgtable-2level.h +++ b/xenolinux-2.4.26-sparse/include/asm-xen/pgtable-2level.h @@ -47,6 +47,11 @@ static inline pmd_t * pmd_offset(pgd_t * dir, unsigned long address) return (pmd_t *) dir; } +#define pte_same(a, b) ((a).pte_low == (b).pte_low) +#define pte_page(x) (mem_map+((unsigned long)((pte_val(x) >> PAGE_SHIFT)))) +#define pte_none(x) (!(x).pte_low) +#define __mk_pte(page_nr,pgprot) __pte(((page_nr) << PAGE_SHIFT) | pgprot_val(pgprot)) + /* * A note on implementation of this atomic 'get-and-clear' operation. 
* This is actually very simple because XenoLinux can only run on a single @@ -59,13 +64,9 @@ static inline pmd_t * pmd_offset(pgd_t * dir, unsigned long address) static inline pte_t ptep_get_and_clear(pte_t *xp) { pte_t pte = *xp; - queue_l1_entry_update(xp, 0); + if ( !pte_none(pte) ) + queue_l1_entry_update(xp, 0); return pte; } -#define pte_same(a, b) ((a).pte_low == (b).pte_low) -#define pte_page(x) (mem_map+((unsigned long)((pte_val(x) >> PAGE_SHIFT)))) -#define pte_none(x) (!(x).pte_low) -#define __mk_pte(page_nr,pgprot) __pte(((page_nr) << PAGE_SHIFT) | pgprot_val(pgprot)) - #endif /* _I386_PGTABLE_2LEVEL_H */ diff --git a/xenolinux-2.4.26-sparse/include/asm-xen/proc_cmd.h b/xenolinux-2.4.26-sparse/include/asm-xen/proc_cmd.h index 30cec9aff5..3bf03c6064 100644 --- a/xenolinux-2.4.26-sparse/include/asm-xen/proc_cmd.h +++ b/xenolinux-2.4.26-sparse/include/asm-xen/proc_cmd.h @@ -32,10 +32,22 @@ typedef struct privcmd_blkmsg int buf_size; } privcmd_blkmsg_t; -#define IOCTL_PRIVCMD_HYPERCALL \ +/* + * @cmd: IOCTL_PRIVCMD_HYPERCALL + * @arg: &privcmd_hypercall_t + * Return: Value returned from execution of the specified hypercall. + */ +#define IOCTL_PRIVCMD_HYPERCALL \ _IOC(_IOC_NONE, 'P', 0, sizeof(privcmd_hypercall_t)) -#define IOCTL_PRIVCMD_BLKMSG \ - _IOC(_IOC_NONE, 'P', 1, sizeof(privcmd_blkmsg_t)) + +/* + * @cmd: IOCTL_PRIVCMD_INITDOMAIN_EVTCHN + * @arg: n/a + * Return: Port associated with domain-controller end of control event channel + * for the initial domain. 
+ */ +#define IOCTL_PRIVCMD_INITDOMAIN_EVTCHN \ + _IOC(_IOC_NONE, 'P', 1, 0) #define IOCTL_PRIVCMD_MMAP \ _IOC(_IOC_NONE, 'P', 2, sizeof(privcmd_mmap_t)) diff --git a/xenolinux-2.4.26-sparse/mm/vmalloc.c b/xenolinux-2.4.26-sparse/mm/vmalloc.c index 4d583b54a7..b030270b42 100644 --- a/xenolinux-2.4.26-sparse/mm/vmalloc.c +++ b/xenolinux-2.4.26-sparse/mm/vmalloc.c @@ -45,6 +45,10 @@ static inline void free_area_pte(pmd_t * pmd, unsigned long address, unsigned lo continue; if (pte_present(page)) { struct page *ptpage = pte_page(page); +#if defined(CONFIG_XEN_PRIVILEGED_GUEST) + if (pte_io(page)) + continue; +#endif if (VALID_PAGE(ptpage) && (!PageReserved(ptpage))) __free_page(ptpage); continue; @@ -250,11 +254,6 @@ void __vfree(void * addr, int free_area_pages) for (p = &vmlist ; (tmp = *p) ; p = &tmp->next) { if (tmp->addr == addr) { *p = tmp->next; -#ifdef CONFIG_XEN_PRIVILEGED_GUEST - if (tmp->flags & VM_IOREMAP) - zap_page_range(&init_mm, VMALLOC_VMADDR(tmp->addr), tmp->size); - else -#endif if (free_area_pages) vmfree_area_pages(VMALLOC_VMADDR(tmp->addr), tmp->size); write_unlock(&vmlist_lock); |