diff options
Diffstat (limited to 'linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend')
6 files changed, 1390 insertions, 0 deletions
diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/Makefile b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/Makefile new file mode 100644 index 0000000000..4c8c17367c --- /dev/null +++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/Makefile @@ -0,0 +1,3 @@ +O_TARGET := drv.o +obj-y := main.o control.o interface.o vbd.o +include $(TOPDIR)/Rules.make diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/common.h b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/common.h new file mode 100644 index 0000000000..d9f1d22908 --- /dev/null +++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/common.h @@ -0,0 +1,108 @@ +/****************************************************************************** + * arch/xen/drivers/blkif/backend/common.h + */ + +#ifndef __BLKIF__BACKEND__COMMON_H__ +#define __BLKIF__BACKEND__COMMON_H__ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/rbtree.h> +#include <linux/interrupt.h> +#include <linux/slab.h> +#include <linux/blkdev.h> +#include <asm/ctrl_if.h> +#include <asm/io.h> +#include "../blkif.h" + +#ifndef NDEBUG +#define ASSERT(_p) \ + if ( !(_p) ) { printk("Assertion '%s' failed, line %d, file %s", #_p , \ + __LINE__, __FILE__); *(int*)0=0; } +#define DPRINTK(_f, _a...) printk("(file=%s, line=%d) " _f, \ + __FILE__ , __LINE__ , ## _a ) +#else +#define ASSERT(_p) ((void)0) +#define DPRINTK(_f, _a...) ((void)0) +#endif + +typedef struct blkif_st { + /* Unique identifier for this interface. */ + domid_t domid; + unsigned int handle; + /* Physical parameters of the comms window. */ + unsigned long shmem_frame; + unsigned int evtchn; + int irq; + /* Comms information. */ + blkif_ring_t *blk_ring_base; /* ioremap()'ed ptr to shmem_frame. */ + BLK_RING_IDX blk_req_cons; /* Request consumer. */ + BLK_RING_IDX blk_resp_prod; /* Private version of response producer. */ + /* VBDs attached to this interface. */ + rb_root_t vbd_rb; /* Mapping from 16-bit vdevices to VBDs. */ + spinlock_t vbd_lock; /* Protects VBD mapping. */ + /* Private fields. */ + enum { DISCONNECTED, DISCONNECTING, CONNECTED } status; + /* + * DISCONNECT response is deferred until pending requests are ack'ed. + * We therefore need to store the id from the original request. + */ + u8 disconnect_rspid; + struct blkif_st *hash_next; + struct list_head blkdev_list; + spinlock_t blk_ring_lock; + atomic_t refcnt; +} blkif_t; + +void blkif_create(blkif_be_create_t *create); +void blkif_destroy(blkif_be_destroy_t *destroy); +void blkif_connect(blkif_be_connect_t *connect); +int blkif_disconnect(blkif_be_disconnect_t *disconnect, u8 rsp_id); +void __blkif_disconnect_complete(blkif_t *blkif); +blkif_t *blkif_find_by_handle(domid_t domid, unsigned int handle); +#define blkif_get(_b) (atomic_inc(&(_b)->refcnt)) +#define blkif_put(_b) \ + do { \ + if ( atomic_dec_and_test(&(_b)->refcnt) ) \ + __blkif_disconnect_complete(_b); \ + } while (0) + +/* An entry in a list of xen_extents. */ +typedef struct _blkif_extent_le { + blkif_extent_t extent; /* an individual extent */ + struct _blkif_extent_le *next; /* and a pointer to the next */ +} blkif_extent_le_t; + +typedef struct _vbd { + blkif_vdev_t vdevice; /* what the domain refers to this vbd as */ + unsigned char readonly; /* Non-zero -> read-only */ + unsigned char type; /* XD_TYPE_xxx */ + blkif_extent_le_t *extents; /* list of xen_extents making up this vbd */ + rb_node_t rb; /* for linking into R-B tree lookup struct */ +} vbd_t; + +void vbd_create(blkif_be_vbd_create_t *create); +void vbd_grow(blkif_be_vbd_grow_t *grow); +void vbd_shrink(blkif_be_vbd_shrink_t *shrink); +void vbd_destroy(blkif_be_vbd_destroy_t *delete); +int vbd_probe(blkif_t *blkif, vdisk_t *vbd_info, int max_vbds); +void destroy_all_vbds(blkif_t *blkif); + +/* Describes a [partial] disk extent (part of a block io request) */ +typedef struct { + unsigned short dev; + unsigned short nr_sects; + unsigned long buffer; + xen_sector_t sector_number; +} phys_seg_t; + +int vbd_translate(phys_seg_t *pseg, blkif_t *blkif, int operation); + +void blkif_interface_init(void); +void blkif_ctrlif_init(void); + +void blkif_deschedule(blkif_t *blkif); + +void blkif_be_int(int irq, void *dev_id, struct pt_regs *regs); + +#endif /* __BLKIF__BACKEND__COMMON_H__ */ diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/control.c b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/control.c new file mode 100644 index 0000000000..0b26224651 --- /dev/null +++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/control.c @@ -0,0 +1,87 @@ +/****************************************************************************** + * arch/xen/drivers/blkif/backend/control.c + * + * Routines for interfacing with the control plane. + * + * Copyright (c) 2004, Keir Fraser + */ + +#include "common.h" + +static void blkif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id) +{ + DPRINTK("Received blkif backend message, subtype=%d\n", msg->subtype); + + switch ( msg->subtype ) + { + case CMSG_BLKIF_BE_CREATE: + if ( msg->length != sizeof(blkif_be_create_t) ) + goto parse_error; + blkif_create((blkif_be_create_t *)&msg->msg[0]); + break; + case CMSG_BLKIF_BE_DESTROY: + if ( msg->length != sizeof(blkif_be_destroy_t) ) + goto parse_error; + blkif_destroy((blkif_be_destroy_t *)&msg->msg[0]); + break; + case CMSG_BLKIF_BE_CONNECT: + if ( msg->length != sizeof(blkif_be_connect_t) ) + goto parse_error; + blkif_connect((blkif_be_connect_t *)&msg->msg[0]); + break; + case CMSG_BLKIF_BE_DISCONNECT: + if ( msg->length != sizeof(blkif_be_disconnect_t) ) + goto parse_error; + if ( !blkif_disconnect((blkif_be_disconnect_t *)&msg->msg[0],msg->id) ) + return; /* Sending the response is deferred until later. */ + break; + case CMSG_BLKIF_BE_VBD_CREATE: + if ( msg->length != sizeof(blkif_be_vbd_create_t) ) + goto parse_error; + vbd_create((blkif_be_vbd_create_t *)&msg->msg[0]); + break; + case CMSG_BLKIF_BE_VBD_DESTROY: + if ( msg->length != sizeof(blkif_be_vbd_destroy_t) ) + goto parse_error; + vbd_destroy((blkif_be_vbd_destroy_t *)&msg->msg[0]); + break; + case CMSG_BLKIF_BE_VBD_GROW: + if ( msg->length != sizeof(blkif_be_vbd_grow_t) ) + goto parse_error; + vbd_grow((blkif_be_vbd_grow_t *)&msg->msg[0]); + break; + case CMSG_BLKIF_BE_VBD_SHRINK: + if ( msg->length != sizeof(blkif_be_vbd_shrink_t) ) + goto parse_error; + vbd_shrink((blkif_be_vbd_shrink_t *)&msg->msg[0]); + break; + default: + goto parse_error; + } + + ctrl_if_send_response(msg); + return; + + parse_error: + DPRINTK("Parse error while reading message subtype %d, len %d\n", + msg->subtype, msg->length); + msg->length = 0; + ctrl_if_send_response(msg); +} + +void blkif_ctrlif_init(void) +{ + ctrl_msg_t cmsg; + blkif_be_driver_status_changed_t st; + + (void)ctrl_if_register_receiver(CMSG_BLKIF_BE, blkif_ctrlif_rx, + CALLBACK_IN_BLOCKING_CONTEXT); + + /* Send a driver-UP notification to the domain controller. */ + cmsg.type = CMSG_BLKIF_BE; + cmsg.subtype = CMSG_BLKIF_BE_DRIVER_STATUS_CHANGED; + cmsg.length = sizeof(blkif_be_driver_status_changed_t); + st.status = BLKIF_DRIVER_STATUS_UP; + memcpy(cmsg.msg, &st, sizeof(st)); + ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE); +} diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/interface.c b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/interface.c new file mode 100644 index 0000000000..780d793c6c --- /dev/null +++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/interface.c @@ -0,0 +1,233 @@ +/****************************************************************************** + * arch/xen/drivers/blkif/backend/interface.c + * + * Block-device interface management. + * + * Copyright (c) 2004, Keir Fraser + */ + +#include "common.h" + +#define BLKIF_HASHSZ 1024 +#define BLKIF_HASH(_d,_h) (((int)(_d)^(int)(_h))&(BLKIF_HASHSZ-1)) + +static kmem_cache_t *blkif_cachep; +static blkif_t *blkif_hash[BLKIF_HASHSZ]; + +blkif_t *blkif_find_by_handle(domid_t domid, unsigned int handle) +{ + blkif_t *blkif = blkif_hash[BLKIF_HASH(domid, handle)]; + while ( (blkif != NULL) && + ((blkif->domid != domid) || (blkif->handle != handle)) ) + blkif = blkif->hash_next; + return blkif; +} + +void __blkif_disconnect_complete(blkif_t *blkif) +{ + ctrl_msg_t cmsg; + blkif_be_disconnect_t disc; + + /* + * These can't be done in __blkif_disconnect() because at that point there + * may be outstanding requests at the disc whose asynchronous responses + * must still be notified to the remote driver. + */ + unbind_evtchn_from_irq(blkif->evtchn); + vfree(blkif->blk_ring_base); + + /* Construct the deferred response message. */ + cmsg.type = CMSG_BLKIF_BE; + cmsg.subtype = CMSG_BLKIF_BE_DISCONNECT; + cmsg.id = blkif->disconnect_rspid; + cmsg.length = sizeof(blkif_be_disconnect_t); + disc.domid = blkif->domid; + disc.blkif_handle = blkif->handle; + disc.status = BLKIF_BE_STATUS_OKAY; + memcpy(cmsg.msg, &disc, sizeof(disc)); + + /* + * Make sure message is constructed /before/ status change, because + * after the status change the 'blkif' structure could be deallocated at + * any time. Also make sure we send the response /after/ status change, + * as otherwise a subsequent CONNECT request could spuriously fail if + * another CPU doesn't see the status change yet. + */ + mb(); + if ( blkif->status != DISCONNECTING ) + BUG(); + blkif->status = DISCONNECTED; + mb(); + + /* Send the successful response. */ + ctrl_if_send_response(&cmsg); +} + +void blkif_create(blkif_be_create_t *create) +{ + domid_t domid = create->domid; + unsigned int handle = create->blkif_handle; + blkif_t **pblkif, *blkif; + + if ( (blkif = kmem_cache_alloc(blkif_cachep, GFP_KERNEL)) == NULL ) + { + DPRINTK("Could not create blkif: out of memory\n"); + create->status = BLKIF_BE_STATUS_OUT_OF_MEMORY; + return; + } + + memset(blkif, 0, sizeof(*blkif)); + blkif->domid = domid; + blkif->handle = handle; + blkif->status = DISCONNECTED; + spin_lock_init(&blkif->vbd_lock); + spin_lock_init(&blkif->blk_ring_lock); + atomic_set(&blkif->refcnt, 0); + + pblkif = &blkif_hash[BLKIF_HASH(domid, handle)]; + while ( *pblkif != NULL ) + { + if ( ((*pblkif)->domid == domid) && ((*pblkif)->handle == handle) ) + { + DPRINTK("Could not create blkif: already exists\n"); + create->status = BLKIF_BE_STATUS_INTERFACE_EXISTS; + kmem_cache_free(blkif_cachep, blkif); + return; + } + pblkif = &(*pblkif)->hash_next; + } + + blkif->hash_next = *pblkif; + *pblkif = blkif; + + DPRINTK("Successfully created blkif\n"); + create->status = BLKIF_BE_STATUS_OKAY; +} + +void blkif_destroy(blkif_be_destroy_t *destroy) +{ + domid_t domid = destroy->domid; + unsigned int handle = destroy->blkif_handle; + blkif_t **pblkif, *blkif; + + pblkif = &blkif_hash[BLKIF_HASH(domid, handle)]; + while ( (blkif = *pblkif) != NULL ) + { + if ( (blkif->domid == domid) && (blkif->handle == handle) ) + { + if ( blkif->status != DISCONNECTED ) + goto still_connected; + goto destroy; + } + pblkif = &blkif->hash_next; + } + + destroy->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND; + return; + + still_connected: + destroy->status = BLKIF_BE_STATUS_INTERFACE_CONNECTED; + return; + + destroy: + *pblkif = blkif->hash_next; + destroy_all_vbds(blkif); + kmem_cache_free(blkif_cachep, blkif); + destroy->status = BLKIF_BE_STATUS_OKAY; +} + +void blkif_connect(blkif_be_connect_t *connect) +{ + domid_t domid = connect->domid; + unsigned int handle = connect->blkif_handle; + unsigned int evtchn = connect->evtchn; + unsigned long shmem_frame = connect->shmem_frame; + struct vm_struct *vma; + pgprot_t prot; + int error; + blkif_t *blkif; + + blkif = blkif_find_by_handle(domid, handle); + if ( unlikely(blkif == NULL) ) + { + DPRINTK("blkif_connect attempted for non-existent blkif (%u,%u)\n", + connect->domid, connect->blkif_handle); + connect->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND; + return; + } + + if ( (vma = get_vm_area(PAGE_SIZE, VM_IOREMAP)) == NULL ) + { + connect->status = BLKIF_BE_STATUS_OUT_OF_MEMORY; + return; + } + + prot = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED); + error = direct_remap_area_pages(&init_mm, VMALLOC_VMADDR(vma->addr), + shmem_frame<<PAGE_SHIFT, PAGE_SIZE, + prot, domid); + if ( error != 0 ) + { + if ( error == -ENOMEM ) + connect->status = BLKIF_BE_STATUS_OUT_OF_MEMORY; + else if ( error == -EFAULT ) + connect->status = BLKIF_BE_STATUS_MAPPING_ERROR; + else + connect->status = BLKIF_BE_STATUS_ERROR; + vfree(vma->addr); + return; + } + + if ( blkif->status != DISCONNECTED ) + { + connect->status = BLKIF_BE_STATUS_INTERFACE_CONNECTED; + vfree(vma->addr); + return; + } + + blkif->evtchn = evtchn; + blkif->irq = bind_evtchn_to_irq(evtchn); + blkif->shmem_frame = shmem_frame; + blkif->blk_ring_base = (blkif_ring_t *)vma->addr; + blkif->status = CONNECTED; + blkif_get(blkif); + + request_irq(blkif->irq, blkif_be_int, 0, "blkif-backend", blkif); + + connect->status = BLKIF_BE_STATUS_OKAY; +} + +int blkif_disconnect(blkif_be_disconnect_t *disconnect, u8 rsp_id) +{ + domid_t domid = disconnect->domid; + unsigned int handle = disconnect->blkif_handle; + blkif_t *blkif; + + blkif = blkif_find_by_handle(domid, handle); + if ( unlikely(blkif == NULL) ) + { + DPRINTK("blkif_disconnect attempted for non-existent blkif" + " (%u,%u)\n", disconnect->domid, disconnect->blkif_handle); + disconnect->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND; + return 1; /* Caller will send response error message. */ + } + + if ( blkif->status == CONNECTED ) + { + blkif->status = DISCONNECTING; + blkif->disconnect_rspid = rsp_id; + wmb(); /* Let other CPUs see the status change. */ + free_irq(blkif->irq, NULL); + blkif_deschedule(blkif); + blkif_put(blkif); + } + + return 0; /* Caller should not send response message. */ +} + +void __init blkif_interface_init(void) +{ + blkif_cachep = kmem_cache_create("blkif_cache", sizeof(blkif_t), + 0, 0, NULL, NULL); + memset(blkif_hash, 0, sizeof(blkif_hash)); +} diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/main.c b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/main.c new file mode 100644 index 0000000000..803af976d2 --- /dev/null +++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/main.c @@ -0,0 +1,523 @@ +/****************************************************************************** + * arch/xen/drivers/blkif/backend/main.c + * + * Back-end of the driver for virtual block devices. This portion of the + * driver exports a 'unified' block-device interface that can be accessed + * by any operating system that implements a compatible front end. A + * reference front-end implementation can be found in: + * arch/xen/drivers/blkif/frontend + * + * Copyright (c) 2003-2004, Keir Fraser & Steve Hand + */ + +#include "common.h" + +/* + * These are rather arbitrary. They are fairly large because adjacent requests + * pulled from a communication ring are quite likely to end up being part of + * the same scatter/gather request at the disc. + * + * ** TRY INCREASING 'MAX_PENDING_REQS' IF WRITE SPEEDS SEEM TOO LOW ** + * This will increase the chances of being able to write whole tracks. + * 64 should be enough to keep us competitive with Linux. + */ +#define MAX_PENDING_REQS 64 +#define BATCH_PER_DOMAIN 16 + +/* + * NB. We place a page of padding between each buffer page to avoid incorrect + * merging of requests by the IDE and SCSI merging routines. Otherwise, two + * adjacent buffers in a scatter-gather request would have adjacent page + * numbers: since the merge routines don't realise that this is in *pseudophys* + * space, not real space, they may collapse the s-g elements! + */ +static unsigned long mmap_vstart; +#define MMAP_PAGES_PER_REQUEST \ + (2 * (BLKIF_MAX_SEGMENTS_PER_REQUEST + 1)) +#define MMAP_PAGES \ + (MAX_PENDING_REQS * MMAP_PAGES_PER_REQUEST) +#define MMAP_VADDR(_req,_seg) \ + (mmap_vstart + \ + ((_req) * MMAP_PAGES_PER_REQUEST * PAGE_SIZE) + \ + ((_seg) * 2 * PAGE_SIZE)) + +/* + * Each outstanding request that we've passed to the lower device layers has a + * 'pending_req' allocated to it. Each buffer_head that completes decrements + * the pendcnt towards zero. When it hits zero, the specified domain has a + * response queued for it, with the saved 'id' passed back. + */ +typedef struct { + blkif_t *blkif; + unsigned long id; + int nr_pages; + atomic_t pendcnt; + unsigned short operation; + int status; +} pending_req_t; + +/* + * We can't allocate pending_req's in order, since they may complete out of + * order. We therefore maintain an allocation ring. This ring also indicates + * when enough work has been passed down -- at that point the allocation ring + * will be empty. + */ +static pending_req_t pending_reqs[MAX_PENDING_REQS]; +static unsigned char pending_ring[MAX_PENDING_REQS]; +static spinlock_t pend_prod_lock = SPIN_LOCK_UNLOCKED; +/* NB. We use a different index type to differentiate from shared blk rings. */ +typedef unsigned int PEND_RING_IDX; +#define MASK_PEND_IDX(_i) ((_i)&(MAX_PENDING_REQS-1)) +static PEND_RING_IDX pending_prod, pending_cons; +#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons) + +static kmem_cache_t *buffer_head_cachep; + +static int do_block_io_op(blkif_t *blkif, int max_to_do); +static void dispatch_probe(blkif_t *blkif, blkif_request_t *req); +static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req); +static void make_response(blkif_t *blkif, unsigned long id, + unsigned short op, int st); + +static void fast_flush_area(int idx, int nr_pages) +{ + multicall_entry_t mcl[MMAP_PAGES_PER_REQUEST]; + int i; + + for ( i = 0; i < nr_pages; i++ ) + { + mcl[i].op = __HYPERVISOR_update_va_mapping; + mcl[i].args[0] = MMAP_VADDR(idx, i) >> PAGE_SHIFT; + mcl[i].args[1] = 0; + mcl[i].args[2] = 0; + } + + mcl[nr_pages-1].args[2] = UVMF_FLUSH_TLB; + (void)HYPERVISOR_multicall(mcl, nr_pages); +} + + +/****************************************************************** + * BLOCK-DEVICE SCHEDULER LIST MAINTENANCE + */ + +static struct list_head io_schedule_list; +static spinlock_t io_schedule_list_lock; + +static int __on_blkdev_list(blkif_t *blkif) +{ + return blkif->blkdev_list.next != NULL; +} + +static void remove_from_blkdev_list(blkif_t *blkif) +{ + unsigned long flags; + if ( !__on_blkdev_list(blkif) ) return; + spin_lock_irqsave(&io_schedule_list_lock, flags); + if ( __on_blkdev_list(blkif) ) + { + list_del(&blkif->blkdev_list); + blkif->blkdev_list.next = NULL; + blkif_put(blkif); + } + spin_unlock_irqrestore(&io_schedule_list_lock, flags); +} + +static void add_to_blkdev_list_tail(blkif_t *blkif) +{ + unsigned long flags; + if ( __on_blkdev_list(blkif) ) return; + spin_lock_irqsave(&io_schedule_list_lock, flags); + if ( !__on_blkdev_list(blkif) && (blkif->status == CONNECTED) ) + { + list_add_tail(&blkif->blkdev_list, &io_schedule_list); + blkif_get(blkif); + } + spin_unlock_irqrestore(&io_schedule_list_lock, flags); +} + + +/****************************************************************** + * SCHEDULER FUNCTIONS + */ + +static void io_schedule(unsigned long unused) +{ + blkif_t *blkif; + struct list_head *ent; + + /* Queue up a batch of requests. */ + while ( (NR_PENDING_REQS < MAX_PENDING_REQS) && + !list_empty(&io_schedule_list) ) + { + ent = io_schedule_list.next; + blkif = list_entry(ent, blkif_t, blkdev_list); + blkif_get(blkif); + remove_from_blkdev_list(blkif); + if ( do_block_io_op(blkif, BATCH_PER_DOMAIN) ) + add_to_blkdev_list_tail(blkif); + blkif_put(blkif); + } + + /* Push the batch through to disc. */ + run_task_queue(&tq_disk); +} + +static DECLARE_TASKLET(io_schedule_tasklet, io_schedule, 0); + +static void maybe_trigger_io_schedule(void) +{ + /* + * Needed so that two processes, who together make the following predicate + * true, don't both read stale values and evaluate the predicate + * incorrectly. Incredibly unlikely to stall the scheduler on x86, but... + */ + smp_mb(); + + if ( (NR_PENDING_REQS < (MAX_PENDING_REQS/2)) && + !list_empty(&io_schedule_list) ) + tasklet_schedule(&io_schedule_tasklet); +} + + + +/****************************************************************** + * COMPLETION CALLBACK -- Called as bh->b_end_io() + */ + +static void __end_block_io_op(pending_req_t *pending_req, int uptodate) +{ + unsigned long flags; + + /* An error fails the entire request. */ + if ( !uptodate ) + { + DPRINTK("Buffer not up-to-date at end of operation\n"); + pending_req->status = BLKIF_RSP_ERROR; + } + + if ( atomic_dec_and_test(&pending_req->pendcnt) ) + { + int pending_idx = pending_req - pending_reqs; + fast_flush_area(pending_idx, pending_req->nr_pages); + make_response(pending_req->blkif, pending_req->id, + pending_req->operation, pending_req->status); + blkif_put(pending_req->blkif); + spin_lock_irqsave(&pend_prod_lock, flags); + pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx; + spin_unlock_irqrestore(&pend_prod_lock, flags); + maybe_trigger_io_schedule(); + } +} + +static void end_block_io_op(struct buffer_head *bh, int uptodate) +{ + __end_block_io_op(bh->b_private, uptodate); + kmem_cache_free(buffer_head_cachep, bh); +} + + + +/****************************************************************************** + * NOTIFICATION FROM GUEST OS. + */ + +void blkif_be_int(int irq, void *dev_id, struct pt_regs *regs) +{ + blkif_t *blkif = dev_id; + add_to_blkdev_list_tail(blkif); + maybe_trigger_io_schedule(); +} + + + +/****************************************************************** + * DOWNWARD CALLS -- These interface with the block-device layer proper. + */ + +static int do_block_io_op(blkif_t *blkif, int max_to_do) +{ + blkif_ring_t *blk_ring = blkif->blk_ring_base; + blkif_request_t *req; + BLK_RING_IDX i; + int more_to_do = 0; + + /* Take items off the comms ring, taking care not to overflow. */ + for ( i = blkif->blk_req_cons; + (i != blk_ring->req_prod) && ((i-blkif->blk_resp_prod) != + BLK_RING_SIZE); + i++ ) + { + if ( (max_to_do-- == 0) || (NR_PENDING_REQS == MAX_PENDING_REQS) ) + { + more_to_do = 1; + break; + } + + req = &blk_ring->ring[MASK_BLK_IDX(i)].req; + switch ( req->operation ) + { + case BLKIF_OP_READ: + case BLKIF_OP_WRITE: + dispatch_rw_block_io(blkif, req); + break; + + case BLKIF_OP_PROBE: + dispatch_probe(blkif, req); + break; + + default: + DPRINTK("error: unknown block io operation [%d]\n", + blk_ring->ring[i].req.operation); + make_response(blkif, blk_ring->ring[i].req.id, + blk_ring->ring[i].req.operation, BLKIF_RSP_ERROR); + break; + } + } + + blkif->blk_req_cons = i; + return more_to_do; +} + +static void dispatch_probe(blkif_t *blkif, blkif_request_t *req) +{ + int rsp = BLKIF_RSP_ERROR; + int pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)]; + + /* We expect one buffer only. */ + if ( unlikely(req->nr_segments != 1) ) + goto out; + + /* Make sure the buffer is page-sized. */ + if ( (blkif_first_sect(req->frame_and_sects[0]) != 0) || + (blkif_last_sect(req->frame_and_sects[0]) != 7) ) + goto out; + + if ( HYPERVISOR_update_va_mapping_otherdomain( + MMAP_VADDR(pending_idx, 0) >> PAGE_SHIFT, + (pte_t) { (req->frame_and_sects[0] & PAGE_MASK) | __PAGE_KERNEL }, + 0, blkif->domid) ) + goto out; + + rsp = vbd_probe(blkif, (vdisk_t *)MMAP_VADDR(pending_idx, 0), + PAGE_SIZE / sizeof(vdisk_t)); + + out: + fast_flush_area(pending_idx, 1); + make_response(blkif, req->id, req->operation, rsp); +} + +static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req) +{ + extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]); + struct buffer_head *bh; + int operation = (req->operation == BLKIF_OP_WRITE) ? WRITE : READ; + short nr_sects; + unsigned long buffer, fas; + int i, tot_sects, pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)]; + pending_req_t *pending_req; + unsigned long remap_prot; + multicall_entry_t mcl[MMAP_PAGES_PER_REQUEST]; + + /* We map virtual scatter/gather segments to physical segments. */ + int new_segs, nr_psegs = 0; + phys_seg_t phys_seg[BLKIF_MAX_SEGMENTS_PER_REQUEST + 1]; + + /* Check that number of segments is sane. */ + if ( unlikely(req->nr_segments == 0) || + unlikely(req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST) ) + { + DPRINTK("Bad number of segments in request (%d)\n", req->nr_segments); + goto bad_descriptor; + } + + /* + * Check each address/size pair is sane, and convert into a + * physical device and block offset. Note that if the offset and size + * crosses a virtual extent boundary, we may end up with more + * physical scatter/gather segments than virtual segments. + */ + for ( i = tot_sects = 0; i < req->nr_segments; i++, tot_sects += nr_sects ) + { + fas = req->frame_and_sects[i]; + buffer = (fas & PAGE_MASK) | (blkif_first_sect(fas) << 9); + nr_sects = blkif_last_sect(fas) - blkif_first_sect(fas) + 1; + + if ( nr_sects <= 0 ) + goto bad_descriptor; + + phys_seg[nr_psegs].dev = req->device; + phys_seg[nr_psegs].sector_number = req->sector_number + tot_sects; + phys_seg[nr_psegs].buffer = buffer; + phys_seg[nr_psegs].nr_sects = nr_sects; + + /* Translate the request into the relevant 'physical device' */ + new_segs = vbd_translate(&phys_seg[nr_psegs], blkif, operation); + if ( new_segs < 0 ) + { + DPRINTK("access denied: %s of [%llu,%llu] on dev=%04x\n", + operation == READ ? "read" : "write", + req->sector_number + tot_sects, + req->sector_number + tot_sects + nr_sects, + req->device); + goto bad_descriptor; + } + + nr_psegs += new_segs; + ASSERT(nr_psegs <= (BLKIF_MAX_SEGMENTS_PER_REQUEST+1)); + } + + /* Nonsensical zero-sized request? */ + if ( unlikely(nr_psegs == 0) ) + goto bad_descriptor; + + if ( operation == READ ) + remap_prot = _PAGE_PRESENT|_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW; + else + remap_prot = _PAGE_PRESENT|_PAGE_DIRTY|_PAGE_ACCESSED; + + for ( i = 0; i < nr_psegs; i++ ) + { + mcl[i].op = __HYPERVISOR_update_va_mapping_otherdomain; + mcl[i].args[0] = MMAP_VADDR(pending_idx, i) >> PAGE_SHIFT; + mcl[i].args[1] = (phys_seg[i].buffer & PAGE_MASK) | remap_prot; + mcl[i].args[2] = 0; + mcl[i].args[3] = blkif->domid; + + phys_to_machine_mapping[__pa(MMAP_VADDR(pending_idx, i))>>PAGE_SHIFT] = + phys_seg[i].buffer >> PAGE_SHIFT; + } + + (void)HYPERVISOR_multicall(mcl, nr_psegs); + + for ( i = 0; i < nr_psegs; i++ ) + { + if ( unlikely(mcl[i].args[5] != 0) ) + { + DPRINTK("invalid buffer -- could not remap it\n"); + fast_flush_area(pending_idx, nr_psegs); + goto bad_descriptor; + } + } + + pending_req = &pending_reqs[pending_idx]; + pending_req->blkif = blkif; + pending_req->id = req->id; + pending_req->operation = operation; + pending_req->status = BLKIF_RSP_OKAY; + pending_req->nr_pages = nr_psegs; + atomic_set(&pending_req->pendcnt, nr_psegs); + pending_cons++; + + blkif_get(blkif); + + /* Now we pass each segment down to the real blkdev layer. */ + for ( i = 0; i < nr_psegs; i++ ) + { + bh = kmem_cache_alloc(buffer_head_cachep, GFP_ATOMIC); + if ( unlikely(bh == NULL) ) + { + __end_block_io_op(pending_req, 0); + continue; + } + memset(bh, 0, sizeof (struct buffer_head)); + + init_waitqueue_head(&bh->b_wait); + bh->b_size = phys_seg[i].nr_sects << 9; + bh->b_dev = phys_seg[i].dev; + bh->b_rdev = phys_seg[i].dev; + bh->b_rsector = (unsigned long)phys_seg[i].sector_number; + bh->b_data = (char *)MMAP_VADDR(pending_idx, i) + + (phys_seg[i].buffer & ~PAGE_MASK); + bh->b_page = virt_to_page(MMAP_VADDR(pending_idx, i)); + bh->b_end_io = end_block_io_op; + bh->b_private = pending_req; + + bh->b_state = (1 << BH_Mapped) | (1 << BH_Lock) | + (1 << BH_Req) | (1 << BH_Launder); + if ( operation == WRITE ) + bh->b_state |= (1 << BH_JBD) | (1 << BH_Req) | (1 << BH_Uptodate); + + atomic_set(&bh->b_count, 1); + + /* Dispatch a single request. We'll flush it to disc later. */ + generic_make_request(operation, bh); + } + + return; + + bad_descriptor: + make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR); +} + + + +/****************************************************************** + * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING + */ + + +static void make_response(blkif_t *blkif, unsigned long id, + unsigned short op, int st) +{ + blkif_response_t *resp; + unsigned long flags; + + /* Place on the response ring for the relevant domain. */ + spin_lock_irqsave(&blkif->blk_ring_lock, flags); + resp = &blkif->blk_ring_base-> + ring[MASK_BLK_IDX(blkif->blk_resp_prod)].resp; + resp->id = id; + resp->operation = op; + resp->status = st; + wmb(); + blkif->blk_ring_base->resp_prod = ++blkif->blk_resp_prod; + spin_unlock_irqrestore(&blkif->blk_ring_lock, flags); + + /* Kick the relevant domain. */ + notify_via_evtchn(blkif->evtchn); +} + +void blkif_deschedule(blkif_t *blkif) +{ + remove_from_blkdev_list(blkif); +} + +static int __init init_module(void) +{ + int i; + + if ( !(start_info.flags & SIF_INITDOMAIN) + && !(start_info.flags & SIF_BLK_BE_DOMAIN) ) + return 0; + + blkif_interface_init(); + + if ( (mmap_vstart = allocate_empty_lowmem_region(MMAP_PAGES)) == 0 ) + BUG(); + + pending_cons = 0; + pending_prod = MAX_PENDING_REQS; + memset(pending_reqs, 0, sizeof(pending_reqs)); + for ( i = 0; i < MAX_PENDING_REQS; i++ ) + pending_ring[i] = i; + + spin_lock_init(&io_schedule_list_lock); + INIT_LIST_HEAD(&io_schedule_list); + + buffer_head_cachep = kmem_cache_create( + "buffer_head_cache", sizeof(struct buffer_head), + 0, SLAB_HWCACHE_ALIGN, NULL, NULL); + + blkif_ctrlif_init(); + + return 0; +} + +static void cleanup_module(void) +{ + BUG(); +} + +module_init(init_module); +module_exit(cleanup_module); diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/vbd.c b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/vbd.c new file mode 100644 index 0000000000..6704fbb541 --- /dev/null +++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/vbd.c @@ -0,0 +1,436 @@ +/****************************************************************************** + * arch/xen/drivers/blkif/backend/vbd.c + * + * Routines for managing virtual block devices (VBDs). + * + * Copyright (c) 2003-2004, Keir Fraser & Steve Hand + */ + +#include "common.h" + +void vbd_create(blkif_be_vbd_create_t *create) +{ + vbd_t *vbd; + rb_node_t **rb_p, *rb_parent = NULL; + blkif_t *blkif; + blkif_vdev_t vdevice = create->vdevice; + + blkif = blkif_find_by_handle(create->domid, create->blkif_handle); + if ( unlikely(blkif == NULL) ) + { + DPRINTK("vbd_create attempted for non-existent blkif (%u,%u)\n", + create->domid, create->blkif_handle); + create->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND; + return; + } + + spin_lock(&blkif->vbd_lock); + + rb_p = &blkif->vbd_rb.rb_node; + while ( *rb_p != NULL ) + { + rb_parent = *rb_p; + vbd = rb_entry(rb_parent, vbd_t, rb); + if ( vdevice < vbd->vdevice ) + { + rb_p = &rb_parent->rb_left; + } + else if ( vdevice > vbd->vdevice ) + { + rb_p = &rb_parent->rb_right; + } + else + { + DPRINTK("vbd_create attempted for already existing vbd\n"); + create->status = BLKIF_BE_STATUS_VBD_EXISTS; + goto out; + } + } + + if ( unlikely((vbd = kmalloc(sizeof(vbd_t), GFP_KERNEL)) == NULL) ) + { + DPRINTK("vbd_create: out of memory\n"); + create->status = BLKIF_BE_STATUS_OUT_OF_MEMORY; + goto out; + } + + vbd->vdevice = vdevice; + vbd->readonly = create->readonly; + vbd->type = VDISK_TYPE_DISK | VDISK_FLAG_VIRT; + vbd->extents = NULL; + + rb_link_node(&vbd->rb, rb_parent, rb_p); + rb_insert_color(&vbd->rb, &blkif->vbd_rb); + + DPRINTK("Successful creation of vdev=%04x (dom=%u)\n", + vdevice, create->domid); + create->status = BLKIF_BE_STATUS_OKAY; + + out: + spin_unlock(&blkif->vbd_lock); +} + + +/* Grow a VBD by appending a new extent. Fails if the VBD doesn't exist. */ +void vbd_grow(blkif_be_vbd_grow_t *grow) +{ + blkif_t *blkif; + blkif_extent_le_t **px, *x; + vbd_t *vbd = NULL; + rb_node_t *rb; + blkif_vdev_t vdevice = grow->vdevice; + + blkif = blkif_find_by_handle(grow->domid, grow->blkif_handle); + if ( unlikely(blkif == NULL) ) + { + DPRINTK("vbd_grow attempted for non-existent blkif (%u,%u)\n", + grow->domid, grow->blkif_handle); + grow->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND; + return; + } + + spin_lock(&blkif->vbd_lock); + + rb = blkif->vbd_rb.rb_node; + while ( rb != NULL ) + { + vbd = rb_entry(rb, vbd_t, rb); + if ( vdevice < vbd->vdevice ) + rb = rb->rb_left; + else if ( vdevice > vbd->vdevice ) + rb = rb->rb_right; + else + break; + } + + if ( unlikely(vbd == NULL) || unlikely(vbd->vdevice != vdevice) ) + { + DPRINTK("vbd_grow: attempted to append extent to non-existent VBD.\n"); + grow->status = BLKIF_BE_STATUS_VBD_NOT_FOUND; + goto out; + } + + if ( unlikely((x = kmalloc(sizeof(blkif_extent_le_t), + GFP_KERNEL)) == NULL) ) + { + DPRINTK("vbd_grow: out of memory\n"); + grow->status = BLKIF_BE_STATUS_OUT_OF_MEMORY; + goto out; + } + + x->extent.device = grow->extent.device; + x->extent.sector_start = grow->extent.sector_start; + x->extent.sector_length = grow->extent.sector_length; + x->next = (blkif_extent_le_t *)NULL; + + for ( px = &vbd->extents; *px != NULL; px = &(*px)->next ) + continue; + + *px = x; + + DPRINTK("Successful grow of vdev=%04x (dom=%u)\n", + vdevice, grow->domid); + grow->status = BLKIF_BE_STATUS_OKAY; + + out: + spin_unlock(&blkif->vbd_lock); +} + + +void vbd_shrink(blkif_be_vbd_shrink_t *shrink) +{ + blkif_t *blkif; + blkif_extent_le_t **px, *x; + vbd_t *vbd = NULL; + rb_node_t *rb; + blkif_vdev_t vdevice = shrink->vdevice; + + blkif = blkif_find_by_handle(shrink->domid, shrink->blkif_handle); + if ( unlikely(blkif == NULL) ) + { + DPRINTK("vbd_shrink attempted for non-existent blkif (%u,%u)\n", + shrink->domid, shrink->blkif_handle); + shrink->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND; + return; + } + + spin_lock(&blkif->vbd_lock); + + rb = blkif->vbd_rb.rb_node; + while ( rb != NULL ) + { + vbd = rb_entry(rb, vbd_t, rb); + if ( vdevice < vbd->vdevice ) + rb = rb->rb_left; + else if ( vdevice > vbd->vdevice ) + rb = rb->rb_right; + else + break; + } + + if ( unlikely(vbd == NULL) || unlikely(vbd->vdevice != vdevice) ) + { + shrink->status = BLKIF_BE_STATUS_VBD_NOT_FOUND; + goto out; + } + + if ( unlikely(vbd->extents == NULL) ) + { + shrink->status = BLKIF_BE_STATUS_EXTENT_NOT_FOUND; + goto out; + } + + /* Find the last extent. We now know that there is at least one. */ + for ( px = &vbd->extents; (*px)->next != NULL; px = &(*px)->next ) + continue; + + x = *px; + *px = x->next; + kfree(x); + + shrink->status = BLKIF_BE_STATUS_OKAY; + + out: + spin_unlock(&blkif->vbd_lock); +} + + +void vbd_destroy(blkif_be_vbd_destroy_t *destroy) +{ + blkif_t *blkif; + vbd_t *vbd; + rb_node_t *rb; + blkif_extent_le_t *x, *t; + blkif_vdev_t vdevice = destroy->vdevice; + + blkif = blkif_find_by_handle(destroy->domid, destroy->blkif_handle); + if ( unlikely(blkif == NULL) ) + { + DPRINTK("vbd_destroy attempted for non-existent blkif (%u,%u)\n", + destroy->domid, destroy->blkif_handle); + destroy->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND; + return; + } + + spin_lock(&blkif->vbd_lock); + + rb = blkif->vbd_rb.rb_node; + while ( rb != NULL ) + { + vbd = rb_entry(rb, vbd_t, rb); + if ( vdevice < vbd->vdevice ) + rb = rb->rb_left; + else if ( vdevice > vbd->vdevice ) + rb = rb->rb_right; + else + goto found; + } + + destroy->status = BLKIF_BE_STATUS_VBD_NOT_FOUND; + goto out; + + found: + rb_erase(rb, &blkif->vbd_rb); + x = vbd->extents; + kfree(vbd); + + while ( x != NULL ) + { + t = x->next; + kfree(x); + x = t; + } + + out: + spin_unlock(&blkif->vbd_lock); +} + + +void destroy_all_vbds(blkif_t *blkif) +{ + vbd_t *vbd; + rb_node_t *rb; + blkif_extent_le_t *x, *t; + + spin_lock(&blkif->vbd_lock); + + while ( (rb = blkif->vbd_rb.rb_node) != NULL ) + { + vbd = rb_entry(rb, vbd_t, rb); + + rb_erase(rb, &blkif->vbd_rb); + x = vbd->extents; + kfree(vbd); + + while ( x != NULL ) + { + t = x->next; + kfree(x); + x = t; + } + } + + spin_unlock(&blkif->vbd_lock); +} + + +static int vbd_probe_single(blkif_t *blkif, vdisk_t *vbd_info, vbd_t *vbd) +{ + blkif_extent_le_t *x; + + vbd_info->device = vbd->vdevice; + vbd_info->info = vbd->type; + if ( vbd->readonly ) + vbd_info->info |= VDISK_FLAG_RO; + vbd_info->capacity = 0ULL; + for ( x = vbd->extents; x != NULL; x = x->next ) + vbd_info->capacity += x->extent.sector_length; + + return 0; +} + + +int vbd_probe(blkif_t *blkif, vdisk_t *vbd_info, int max_vbds) +{ + int rc = 0, nr_vbds = 0; + rb_node_t *rb; + + spin_lock(&blkif->vbd_lock); + + if ( (rb = blkif->vbd_rb.rb_node) == NULL ) + goto out; + + new_subtree: + /* STEP 1. Find least node (it'll be left-most). */ + while ( rb->rb_left != NULL ) + rb = rb->rb_left; + + for ( ; ; ) + { + /* STEP 2. Dealt with left subtree. Now process current node. */ + if ( (rc = vbd_probe_single(blkif, &vbd_info[nr_vbds], + rb_entry(rb, vbd_t, rb))) != 0 ) + goto out; + if ( ++nr_vbds == max_vbds ) + goto out; + + /* STEP 3. Process right subtree, if any. */ + if ( rb->rb_right != NULL ) + { + rb = rb->rb_right; + goto new_subtree; + } + + /* STEP 4. Done both subtrees. Head back through ancesstors. */ + for ( ; ; ) + { + /* We're done when we get back to the root node. */ + if ( rb->rb_parent == NULL ) + goto out; + /* If we are left of parent, then parent is next to process. */ + if ( rb->rb_parent->rb_left == rb ) + break; + /* If we are right of parent, then we climb to grandparent. */ + rb = rb->rb_parent; + } + + rb = rb->rb_parent; + } + + out: + spin_unlock(&blkif->vbd_lock); + return (rc == 0) ? nr_vbds : rc; +} + + +int vbd_translate(phys_seg_t *pseg, blkif_t *blkif, int operation) +{ + blkif_extent_le_t *x; + vbd_t *vbd; + rb_node_t *rb; + blkif_sector_t sec_off; + unsigned long nr_secs; + + spin_lock(&blkif->vbd_lock); + + rb = blkif->vbd_rb.rb_node; + while ( rb != NULL ) + { + vbd = rb_entry(rb, vbd_t, rb); + if ( pseg->dev < vbd->vdevice ) + rb = rb->rb_left; + else if ( pseg->dev > vbd->vdevice ) + rb = rb->rb_right; + else + goto found; + } + + DPRINTK("vbd_translate; domain %u attempted to access " + "non-existent VBD.\n", blkif->domid); + + spin_unlock(&blkif->vbd_lock); + return -ENODEV; + + found: + + if ( (operation == WRITE) && vbd->readonly ) + { + spin_unlock(&blkif->vbd_lock); + return -EACCES; + } + + /* + * Now iterate through the list of blkif_extents, working out which should + * be used to perform the translation. + */ + sec_off = pseg->sector_number; + nr_secs = pseg->nr_sects; + for ( x = vbd->extents; x != NULL; x = x->next ) + { + if ( sec_off < x->extent.sector_length ) + { + pseg->dev = x->extent.device; + pseg->sector_number = x->extent.sector_start + sec_off; + if ( unlikely((sec_off + nr_secs) > x->extent.sector_length) ) + goto overrun; + spin_unlock(&p->vbd_lock); + return 1; + } + sec_off -= x->extent.sector_length; + } + + DPRINTK("vbd_translate: end of vbd.\n"); + spin_unlock(&blkif->vbd_lock); + return -EACCES; + + /* + * Here we deal with overrun onto the following extent. We don't deal with + * overrun of more than one boundary since each request is restricted to + * 2^9 512-byte sectors, so it should be trivial for control software to + * ensure that extents are large enough to prevent excessive overrun. + */ + overrun: + + /* Adjust length of first chunk to run to end of first extent. */ + pseg[0].nr_sects = x->extent.sector_length - sec_off; + + /* Set second chunk buffer and length to start where first chunk ended. */ + pseg[1].buffer = pseg[0].buffer + (pseg[0].nr_sects << 9); + pseg[1].nr_sects = nr_secs - pseg[0].nr_sects; + + /* Now move to the next extent. Check it exists and is long enough! */ + if ( unlikely((x = x->next) == NULL) || + unlikely(x->extent.sector_length < pseg[1].nr_sects) ) + { + DPRINTK("vbd_translate: multiple overruns or end of vbd.\n"); + spin_unlock(&p->vbd_lock); + return -EACCES; + } + + /* Store the real device and start sector for the second chunk. */ + pseg[1].dev = x->extent.device; + pseg[1].sector_number = x->extent.sector_start; + + spin_unlock(&blkif->vbd_lock); + return 2; +} |