Diffstat (limited to 'linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend')
-rw-r--r--  linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/Makefile       3
-rw-r--r--  linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/common.h     108
-rw-r--r--  linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/control.c     87
-rw-r--r--  linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/interface.c  233
-rw-r--r--  linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/main.c       523
-rw-r--r--  linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/vbd.c        436
6 files changed, 1390 insertions(+), 0 deletions(-)
diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/Makefile b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/Makefile
new file mode 100644
index 0000000000..4c8c17367c
--- /dev/null
+++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/Makefile
@@ -0,0 +1,3 @@
+O_TARGET := drv.o
+obj-y := main.o control.o interface.o vbd.o
+include $(TOPDIR)/Rules.make
diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/common.h b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/common.h
new file mode 100644
index 0000000000..d9f1d22908
--- /dev/null
+++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/common.h
@@ -0,0 +1,108 @@
+/******************************************************************************
+ * arch/xen/drivers/blkif/backend/common.h
+ */
+
+#ifndef __BLKIF__BACKEND__COMMON_H__
+#define __BLKIF__BACKEND__COMMON_H__
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/rbtree.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <asm/ctrl_if.h>
+#include <asm/io.h>
+#include "../blkif.h"
+
+#ifndef NDEBUG
+#define ASSERT(_p) \
+    if ( !(_p) ) { printk("Assertion '%s' failed, line %d, file %s\n", #_p , \
+ __LINE__, __FILE__); *(int*)0=0; }
+#define DPRINTK(_f, _a...) printk("(file=%s, line=%d) " _f, \
+ __FILE__ , __LINE__ , ## _a )
+#else
+#define ASSERT(_p) ((void)0)
+#define DPRINTK(_f, _a...) ((void)0)
+#endif
+
+typedef struct blkif_st {
+ /* Unique identifier for this interface. */
+ domid_t domid;
+ unsigned int handle;
+ /* Physical parameters of the comms window. */
+ unsigned long shmem_frame;
+ unsigned int evtchn;
+ int irq;
+ /* Comms information. */
+ blkif_ring_t *blk_ring_base; /* ioremap()'ed ptr to shmem_frame. */
+ BLK_RING_IDX blk_req_cons; /* Request consumer. */
+ BLK_RING_IDX blk_resp_prod; /* Private version of response producer. */
+ /* VBDs attached to this interface. */
+ rb_root_t vbd_rb; /* Mapping from 16-bit vdevices to VBDs. */
+ spinlock_t vbd_lock; /* Protects VBD mapping. */
+ /* Private fields. */
+ enum { DISCONNECTED, DISCONNECTING, CONNECTED } status;
+ /*
+ * DISCONNECT response is deferred until pending requests are ack'ed.
+ * We therefore need to store the id from the original request.
+ */
+ u8 disconnect_rspid;
+ struct blkif_st *hash_next;
+ struct list_head blkdev_list;
+ spinlock_t blk_ring_lock;
+ atomic_t refcnt;
+} blkif_t;
+
+void blkif_create(blkif_be_create_t *create);
+void blkif_destroy(blkif_be_destroy_t *destroy);
+void blkif_connect(blkif_be_connect_t *connect);
+int blkif_disconnect(blkif_be_disconnect_t *disconnect, u8 rsp_id);
+void __blkif_disconnect_complete(blkif_t *blkif);
+blkif_t *blkif_find_by_handle(domid_t domid, unsigned int handle);
+#define blkif_get(_b) (atomic_inc(&(_b)->refcnt))
+#define blkif_put(_b) \
+ do { \
+ if ( atomic_dec_and_test(&(_b)->refcnt) ) \
+ __blkif_disconnect_complete(_b); \
+ } while (0)
+
+/* An entry in a list of xen_extents. */
+typedef struct _blkif_extent_le {
+ blkif_extent_t extent; /* an individual extent */
+ struct _blkif_extent_le *next; /* and a pointer to the next */
+} blkif_extent_le_t;
+
+typedef struct _vbd {
+ blkif_vdev_t vdevice; /* what the domain refers to this vbd as */
+ unsigned char readonly; /* Non-zero -> read-only */
+ unsigned char type; /* XD_TYPE_xxx */
+ blkif_extent_le_t *extents; /* list of xen_extents making up this vbd */
+ rb_node_t rb; /* for linking into R-B tree lookup struct */
+} vbd_t;
+
+void vbd_create(blkif_be_vbd_create_t *create);
+void vbd_grow(blkif_be_vbd_grow_t *grow);
+void vbd_shrink(blkif_be_vbd_shrink_t *shrink);
+void vbd_destroy(blkif_be_vbd_destroy_t *delete);
+int vbd_probe(blkif_t *blkif, vdisk_t *vbd_info, int max_vbds);
+void destroy_all_vbds(blkif_t *blkif);
+
+/* Describes a [partial] disk extent (part of a block io request) */
+typedef struct {
+ unsigned short dev;
+ unsigned short nr_sects;
+ unsigned long buffer;
+ xen_sector_t sector_number;
+} phys_seg_t;
+
+int vbd_translate(phys_seg_t *pseg, blkif_t *blkif, int operation);
+
+void blkif_interface_init(void);
+void blkif_ctrlif_init(void);
+
+void blkif_deschedule(blkif_t *blkif);
+
+void blkif_be_int(int irq, void *dev_id, struct pt_regs *regs);
+
+#endif /* __BLKIF__BACKEND__COMMON_H__ */
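
The blkif_get()/blkif_put() pair above implements the deferred-disconnect
scheme described in the struct comment: the DISCONNECT response cannot be
sent while requests are still in flight, so it is the final blkif_put()
after the interface enters DISCONNECTING that triggers
__blkif_disconnect_complete(). A minimal userspace sketch of the pattern
(a plain int standing in for atomic_t, made-up names, illustration only):

    #include <stdio.h>

    enum status { DISCONNECTED, DISCONNECTING, CONNECTED };

    struct fake_blkif {
        int         refcnt;   /* stands in for atomic_t refcnt */
        enum status status;
    };

    static void disconnect_complete(struct fake_blkif *b)
    {
        /* Runs exactly once, when the final reference is dropped. */
        b->status = DISCONNECTED;
        printf("teardown complete\n");
    }

    static void get(struct fake_blkif *b) { b->refcnt++; }

    static void put(struct fake_blkif *b)
    {
        if ( --b->refcnt == 0 )
            disconnect_complete(b);
    }

    int main(void)
    {
        struct fake_blkif b = { .refcnt = 0, .status = CONNECTED };

        get(&b);        /* connect takes the base reference */
        get(&b);        /* an in-flight request pins the interface */

        b.status = DISCONNECTING;
        put(&b);        /* drop the base ref: teardown still deferred */
        put(&b);        /* last request completes: teardown runs now */
        return 0;
    }
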
diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/control.c b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/control.c
new file mode 100644
index 0000000000..0b26224651
--- /dev/null
+++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/control.c
@@ -0,0 +1,87 @@
+/******************************************************************************
+ * arch/xen/drivers/blkif/backend/control.c
+ *
+ * Routines for interfacing with the control plane.
+ *
+ * Copyright (c) 2004, Keir Fraser
+ */
+
+#include "common.h"
+
+static void blkif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id)
+{
+ DPRINTK("Received blkif backend message, subtype=%d\n", msg->subtype);
+
+ switch ( msg->subtype )
+ {
+ case CMSG_BLKIF_BE_CREATE:
+ if ( msg->length != sizeof(blkif_be_create_t) )
+ goto parse_error;
+ blkif_create((blkif_be_create_t *)&msg->msg[0]);
+ break;
+ case CMSG_BLKIF_BE_DESTROY:
+ if ( msg->length != sizeof(blkif_be_destroy_t) )
+ goto parse_error;
+ blkif_destroy((blkif_be_destroy_t *)&msg->msg[0]);
+ break;
+ case CMSG_BLKIF_BE_CONNECT:
+ if ( msg->length != sizeof(blkif_be_connect_t) )
+ goto parse_error;
+ blkif_connect((blkif_be_connect_t *)&msg->msg[0]);
+ break;
+ case CMSG_BLKIF_BE_DISCONNECT:
+ if ( msg->length != sizeof(blkif_be_disconnect_t) )
+ goto parse_error;
+ if ( !blkif_disconnect((blkif_be_disconnect_t *)&msg->msg[0],msg->id) )
+ return; /* Sending the response is deferred until later. */
+ break;
+ case CMSG_BLKIF_BE_VBD_CREATE:
+ if ( msg->length != sizeof(blkif_be_vbd_create_t) )
+ goto parse_error;
+ vbd_create((blkif_be_vbd_create_t *)&msg->msg[0]);
+ break;
+ case CMSG_BLKIF_BE_VBD_DESTROY:
+ if ( msg->length != sizeof(blkif_be_vbd_destroy_t) )
+ goto parse_error;
+ vbd_destroy((blkif_be_vbd_destroy_t *)&msg->msg[0]);
+ break;
+ case CMSG_BLKIF_BE_VBD_GROW:
+ if ( msg->length != sizeof(blkif_be_vbd_grow_t) )
+ goto parse_error;
+ vbd_grow((blkif_be_vbd_grow_t *)&msg->msg[0]);
+ break;
+ case CMSG_BLKIF_BE_VBD_SHRINK:
+ if ( msg->length != sizeof(blkif_be_vbd_shrink_t) )
+ goto parse_error;
+ vbd_shrink((blkif_be_vbd_shrink_t *)&msg->msg[0]);
+ break;
+ default:
+ goto parse_error;
+ }
+
+ ctrl_if_send_response(msg);
+ return;
+
+ parse_error:
+ DPRINTK("Parse error while reading message subtype %d, len %d\n",
+ msg->subtype, msg->length);
+ msg->length = 0;
+ ctrl_if_send_response(msg);
+}
+
+void blkif_ctrlif_init(void)
+{
+ ctrl_msg_t cmsg;
+ blkif_be_driver_status_changed_t st;
+
+ (void)ctrl_if_register_receiver(CMSG_BLKIF_BE, blkif_ctrlif_rx,
+ CALLBACK_IN_BLOCKING_CONTEXT);
+
+ /* Send a driver-UP notification to the domain controller. */
+ cmsg.type = CMSG_BLKIF_BE;
+ cmsg.subtype = CMSG_BLKIF_BE_DRIVER_STATUS_CHANGED;
+ cmsg.length = sizeof(blkif_be_driver_status_changed_t);
+ st.status = BLKIF_DRIVER_STATUS_UP;
+ memcpy(cmsg.msg, &st, sizeof(st));
+ ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
+}
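
blkif_ctrlif_rx() above validates each message's length against the size of
the expected payload struct before dispatching on the subtype, and treats
any mismatch as a parse error. The same validate-then-dispatch idea can also
be written table-driven; the sketch below is a generic illustration with
hypothetical subtypes, payload types and handlers, not the actual
control-interface API:

    #include <stdio.h>
    #include <string.h>

    /* Hypothetical message and payload types, for illustration only. */
    typedef struct { int subtype; size_t length; unsigned char msg[64]; } msg_t;
    typedef struct { int handle; int flags; } create_t;
    typedef void (*handler_t)(void *payload);

    static void on_create(void *p)
    {
        printf("create(handle=%d)\n", ((create_t *)p)->handle);
    }

    static const struct {
        int       subtype;
        size_t    expected_len;   /* payload must be exactly this size */
        handler_t fn;
    } table[] = {
        { 1, sizeof(create_t), on_create },
    };

    /* Returns 0 on success, -1 on parse error (bad subtype or length). */
    static int dispatch(msg_t *m)
    {
        size_t i;
        for ( i = 0; i < sizeof(table)/sizeof(table[0]); i++ )
        {
            if ( table[i].subtype != m->subtype )
                continue;
            if ( table[i].expected_len != m->length )
                return -1;        /* reject before touching the payload */
            table[i].fn(m->msg);
            return 0;
        }
        return -1;                /* unknown subtype */
    }

    int main(void)
    {
        msg_t m = { .subtype = 1, .length = sizeof(create_t) };
        create_t c = { 7, 0 };

        memcpy(m.msg, &c, sizeof(c));
        printf("well-formed: %d\n", dispatch(&m));
        m.length = 3;             /* truncated payload is refused */
        printf("truncated:   %d\n", dispatch(&m));
        return 0;
    }
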
diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/interface.c b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/interface.c
new file mode 100644
index 0000000000..780d793c6c
--- /dev/null
+++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/interface.c
@@ -0,0 +1,233 @@
+/******************************************************************************
+ * arch/xen/drivers/blkif/backend/interface.c
+ *
+ * Block-device interface management.
+ *
+ * Copyright (c) 2004, Keir Fraser
+ */
+
+#include "common.h"
+
+#define BLKIF_HASHSZ 1024
+#define BLKIF_HASH(_d,_h) (((int)(_d)^(int)(_h))&(BLKIF_HASHSZ-1))
+
+static kmem_cache_t *blkif_cachep;
+static blkif_t *blkif_hash[BLKIF_HASHSZ];
+
+blkif_t *blkif_find_by_handle(domid_t domid, unsigned int handle)
+{
+ blkif_t *blkif = blkif_hash[BLKIF_HASH(domid, handle)];
+ while ( (blkif != NULL) &&
+ ((blkif->domid != domid) || (blkif->handle != handle)) )
+ blkif = blkif->hash_next;
+ return blkif;
+}
+
+void __blkif_disconnect_complete(blkif_t *blkif)
+{
+ ctrl_msg_t cmsg;
+ blkif_be_disconnect_t disc;
+
+ /*
+     * These can't be done in blkif_disconnect() because at that point there
+     * may be outstanding requests at the disc whose asynchronous responses
+     * must still be delivered to the remote driver.
+ */
+ unbind_evtchn_from_irq(blkif->evtchn);
+ vfree(blkif->blk_ring_base);
+
+ /* Construct the deferred response message. */
+ cmsg.type = CMSG_BLKIF_BE;
+ cmsg.subtype = CMSG_BLKIF_BE_DISCONNECT;
+ cmsg.id = blkif->disconnect_rspid;
+ cmsg.length = sizeof(blkif_be_disconnect_t);
+ disc.domid = blkif->domid;
+ disc.blkif_handle = blkif->handle;
+ disc.status = BLKIF_BE_STATUS_OKAY;
+ memcpy(cmsg.msg, &disc, sizeof(disc));
+
+ /*
+ * Make sure message is constructed /before/ status change, because
+ * after the status change the 'blkif' structure could be deallocated at
+ * any time. Also make sure we send the response /after/ status change,
+ * as otherwise a subsequent CONNECT request could spuriously fail if
+ * another CPU doesn't see the status change yet.
+ */
+ mb();
+ if ( blkif->status != DISCONNECTING )
+ BUG();
+ blkif->status = DISCONNECTED;
+ mb();
+
+ /* Send the successful response. */
+ ctrl_if_send_response(&cmsg);
+}
+
+void blkif_create(blkif_be_create_t *create)
+{
+ domid_t domid = create->domid;
+ unsigned int handle = create->blkif_handle;
+ blkif_t **pblkif, *blkif;
+
+ if ( (blkif = kmem_cache_alloc(blkif_cachep, GFP_KERNEL)) == NULL )
+ {
+ DPRINTK("Could not create blkif: out of memory\n");
+ create->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
+ return;
+ }
+
+ memset(blkif, 0, sizeof(*blkif));
+ blkif->domid = domid;
+ blkif->handle = handle;
+ blkif->status = DISCONNECTED;
+ spin_lock_init(&blkif->vbd_lock);
+ spin_lock_init(&blkif->blk_ring_lock);
+ atomic_set(&blkif->refcnt, 0);
+
+ pblkif = &blkif_hash[BLKIF_HASH(domid, handle)];
+ while ( *pblkif != NULL )
+ {
+ if ( ((*pblkif)->domid == domid) && ((*pblkif)->handle == handle) )
+ {
+ DPRINTK("Could not create blkif: already exists\n");
+ create->status = BLKIF_BE_STATUS_INTERFACE_EXISTS;
+ kmem_cache_free(blkif_cachep, blkif);
+ return;
+ }
+ pblkif = &(*pblkif)->hash_next;
+ }
+
+ blkif->hash_next = *pblkif;
+ *pblkif = blkif;
+
+ DPRINTK("Successfully created blkif\n");
+ create->status = BLKIF_BE_STATUS_OKAY;
+}
+
+void blkif_destroy(blkif_be_destroy_t *destroy)
+{
+ domid_t domid = destroy->domid;
+ unsigned int handle = destroy->blkif_handle;
+ blkif_t **pblkif, *blkif;
+
+ pblkif = &blkif_hash[BLKIF_HASH(domid, handle)];
+ while ( (blkif = *pblkif) != NULL )
+ {
+ if ( (blkif->domid == domid) && (blkif->handle == handle) )
+ {
+ if ( blkif->status != DISCONNECTED )
+ goto still_connected;
+ goto destroy;
+ }
+ pblkif = &blkif->hash_next;
+ }
+
+ destroy->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
+ return;
+
+ still_connected:
+ destroy->status = BLKIF_BE_STATUS_INTERFACE_CONNECTED;
+ return;
+
+ destroy:
+ *pblkif = blkif->hash_next;
+ destroy_all_vbds(blkif);
+ kmem_cache_free(blkif_cachep, blkif);
+ destroy->status = BLKIF_BE_STATUS_OKAY;
+}
+
+void blkif_connect(blkif_be_connect_t *connect)
+{
+ domid_t domid = connect->domid;
+ unsigned int handle = connect->blkif_handle;
+ unsigned int evtchn = connect->evtchn;
+ unsigned long shmem_frame = connect->shmem_frame;
+ struct vm_struct *vma;
+ pgprot_t prot;
+ int error;
+ blkif_t *blkif;
+
+ blkif = blkif_find_by_handle(domid, handle);
+ if ( unlikely(blkif == NULL) )
+ {
+ DPRINTK("blkif_connect attempted for non-existent blkif (%u,%u)\n",
+ connect->domid, connect->blkif_handle);
+ connect->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
+ return;
+ }
+
+ if ( (vma = get_vm_area(PAGE_SIZE, VM_IOREMAP)) == NULL )
+ {
+ connect->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
+ return;
+ }
+
+ prot = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED);
+ error = direct_remap_area_pages(&init_mm, VMALLOC_VMADDR(vma->addr),
+ shmem_frame<<PAGE_SHIFT, PAGE_SIZE,
+ prot, domid);
+ if ( error != 0 )
+ {
+ if ( error == -ENOMEM )
+ connect->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
+ else if ( error == -EFAULT )
+ connect->status = BLKIF_BE_STATUS_MAPPING_ERROR;
+ else
+ connect->status = BLKIF_BE_STATUS_ERROR;
+ vfree(vma->addr);
+ return;
+ }
+
+ if ( blkif->status != DISCONNECTED )
+ {
+ connect->status = BLKIF_BE_STATUS_INTERFACE_CONNECTED;
+ vfree(vma->addr);
+ return;
+ }
+
+ blkif->evtchn = evtchn;
+ blkif->irq = bind_evtchn_to_irq(evtchn);
+ blkif->shmem_frame = shmem_frame;
+ blkif->blk_ring_base = (blkif_ring_t *)vma->addr;
+ blkif->status = CONNECTED;
+ blkif_get(blkif);
+
+ request_irq(blkif->irq, blkif_be_int, 0, "blkif-backend", blkif);
+
+ connect->status = BLKIF_BE_STATUS_OKAY;
+}
+
+int blkif_disconnect(blkif_be_disconnect_t *disconnect, u8 rsp_id)
+{
+ domid_t domid = disconnect->domid;
+ unsigned int handle = disconnect->blkif_handle;
+ blkif_t *blkif;
+
+ blkif = blkif_find_by_handle(domid, handle);
+ if ( unlikely(blkif == NULL) )
+ {
+ DPRINTK("blkif_disconnect attempted for non-existent blkif"
+ " (%u,%u)\n", disconnect->domid, disconnect->blkif_handle);
+ disconnect->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
+ return 1; /* Caller will send response error message. */
+ }
+
+ if ( blkif->status == CONNECTED )
+ {
+ blkif->status = DISCONNECTING;
+ blkif->disconnect_rspid = rsp_id;
+ wmb(); /* Let other CPUs see the status change. */
+        free_irq(blkif->irq, blkif);
+ blkif_deschedule(blkif);
+ blkif_put(blkif);
+ }
+
+ return 0; /* Caller should not send response message. */
+}
+
+void __init blkif_interface_init(void)
+{
+ blkif_cachep = kmem_cache_create("blkif_cache", sizeof(blkif_t),
+ 0, 0, NULL, NULL);
+ memset(blkif_hash, 0, sizeof(blkif_hash));
+}
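
blkif_create() and blkif_find_by_handle() above share a single open-chained
hash table keyed by BLKIF_HASH(), an XOR of domid and handle masked to a
power-of-two table size; creation walks the chain to reject duplicates and
links the new entry at the tail. A standalone sketch of the same structure
(hypothetical node type and test values):

    #include <stdio.h>
    #include <stdlib.h>

    #define HASHSZ       1024
    #define HASH(_d,_h)  (((int)(_d)^(int)(_h))&(HASHSZ-1))

    struct node {
        unsigned int domid, handle;
        struct node *next;
    };

    static struct node *table[HASHSZ];

    static int insert(unsigned int domid, unsigned int handle)
    {
        struct node **p = &table[HASH(domid, handle)], *n;

        for ( ; *p != NULL; p = &(*p)->next )
            if ( ((*p)->domid == domid) && ((*p)->handle == handle) )
                return -1;                 /* already exists */

        if ( (n = malloc(sizeof(*n))) == NULL )
            return -1;
        n->domid  = domid;
        n->handle = handle;
        n->next   = NULL;
        *p = n;                            /* link at the tail of the chain */
        return 0;
    }

    static struct node *find(unsigned int domid, unsigned int handle)
    {
        struct node *n = table[HASH(domid, handle)];
        while ( (n != NULL) &&
                ((n->domid != domid) || (n->handle != handle)) )
            n = n->next;
        return n;
    }

    int main(void)
    {
        insert(1, 0);
        insert(0, 1);   /* (0,1) collides with (1,0): both hash to bucket 1 */
        printf("(1,0): %s\n", find(1, 0) ? "found" : "missing");
        printf("(0,1): %s\n", find(0, 1) ? "found" : "missing");
        printf("(3,0): %s\n", find(3, 0) ? "found" : "missing");
        return 0;
    }
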
diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/main.c b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/main.c
new file mode 100644
index 0000000000..803af976d2
--- /dev/null
+++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/main.c
@@ -0,0 +1,523 @@
+/******************************************************************************
+ * arch/xen/drivers/blkif/backend/main.c
+ *
+ * Back-end of the driver for virtual block devices. This portion of the
+ * driver exports a 'unified' block-device interface that can be accessed
+ * by any operating system that implements a compatible front end. A
+ * reference front-end implementation can be found in:
+ * arch/xen/drivers/blkif/frontend
+ *
+ * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
+ */
+
+#include "common.h"
+
+/*
+ * These are rather arbitrary. They are fairly large because adjacent requests
+ * pulled from a communication ring are quite likely to end up being part of
+ * the same scatter/gather request at the disc.
+ *
+ * ** TRY INCREASING 'MAX_PENDING_REQS' IF WRITE SPEEDS SEEM TOO LOW **
+ * This will increase the chances of being able to write whole tracks.
+ * 64 should be enough to keep us competitive with Linux.
+ */
+#define MAX_PENDING_REQS 64
+#define BATCH_PER_DOMAIN 16
+
+/*
+ * NB. We place a page of padding between each buffer page to avoid incorrect
+ * merging of requests by the IDE and SCSI merging routines. Otherwise, two
+ * adjacent buffers in a scatter-gather request would have adjacent page
+ * numbers: since the merge routines don't realise that this is in *pseudophys*
+ * space, not real space, they may collapse the s-g elements!
+ */
+static unsigned long mmap_vstart;
+#define MMAP_PAGES_PER_REQUEST \
+ (2 * (BLKIF_MAX_SEGMENTS_PER_REQUEST + 1))
+#define MMAP_PAGES \
+ (MAX_PENDING_REQS * MMAP_PAGES_PER_REQUEST)
+#define MMAP_VADDR(_req,_seg) \
+ (mmap_vstart + \
+ ((_req) * MMAP_PAGES_PER_REQUEST * PAGE_SIZE) + \
+ ((_seg) * 2 * PAGE_SIZE))
+
+/*
+ * Each outstanding request that we've passed to the lower device layers has a
+ * 'pending_req' allocated to it. Each buffer_head that completes decrements
+ * the pendcnt towards zero. When it hits zero, the specified domain has a
+ * response queued for it, with the saved 'id' passed back.
+ */
+typedef struct {
+ blkif_t *blkif;
+ unsigned long id;
+ int nr_pages;
+ atomic_t pendcnt;
+ unsigned short operation;
+ int status;
+} pending_req_t;
+
+/*
+ * We can't allocate pending_req's in order, since they may complete out of
+ * order. We therefore maintain an allocation ring. This ring also indicates
+ * when enough work has been passed down -- at that point the allocation ring
+ * will be empty.
+ */
+static pending_req_t pending_reqs[MAX_PENDING_REQS];
+static unsigned char pending_ring[MAX_PENDING_REQS];
+static spinlock_t pend_prod_lock = SPIN_LOCK_UNLOCKED;
+/* NB. We use a different index type to differentiate from shared blk rings. */
+typedef unsigned int PEND_RING_IDX;
+#define MASK_PEND_IDX(_i) ((_i)&(MAX_PENDING_REQS-1))
+static PEND_RING_IDX pending_prod, pending_cons;
+#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)
+
+static kmem_cache_t *buffer_head_cachep;
+
+static int do_block_io_op(blkif_t *blkif, int max_to_do);
+static void dispatch_probe(blkif_t *blkif, blkif_request_t *req);
+static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req);
+static void make_response(blkif_t *blkif, unsigned long id,
+ unsigned short op, int st);
+
+static void fast_flush_area(int idx, int nr_pages)
+{
+ multicall_entry_t mcl[MMAP_PAGES_PER_REQUEST];
+ int i;
+
+ for ( i = 0; i < nr_pages; i++ )
+ {
+ mcl[i].op = __HYPERVISOR_update_va_mapping;
+ mcl[i].args[0] = MMAP_VADDR(idx, i) >> PAGE_SHIFT;
+ mcl[i].args[1] = 0;
+ mcl[i].args[2] = 0;
+ }
+
+ mcl[nr_pages-1].args[2] = UVMF_FLUSH_TLB;
+ (void)HYPERVISOR_multicall(mcl, nr_pages);
+}
+
+
+/******************************************************************
+ * BLOCK-DEVICE SCHEDULER LIST MAINTENANCE
+ */
+
+static struct list_head io_schedule_list;
+static spinlock_t io_schedule_list_lock;
+
+static int __on_blkdev_list(blkif_t *blkif)
+{
+ return blkif->blkdev_list.next != NULL;
+}
+
+static void remove_from_blkdev_list(blkif_t *blkif)
+{
+ unsigned long flags;
+ if ( !__on_blkdev_list(blkif) ) return;
+ spin_lock_irqsave(&io_schedule_list_lock, flags);
+ if ( __on_blkdev_list(blkif) )
+ {
+ list_del(&blkif->blkdev_list);
+ blkif->blkdev_list.next = NULL;
+ blkif_put(blkif);
+ }
+ spin_unlock_irqrestore(&io_schedule_list_lock, flags);
+}
+
+static void add_to_blkdev_list_tail(blkif_t *blkif)
+{
+ unsigned long flags;
+ if ( __on_blkdev_list(blkif) ) return;
+ spin_lock_irqsave(&io_schedule_list_lock, flags);
+ if ( !__on_blkdev_list(blkif) && (blkif->status == CONNECTED) )
+ {
+ list_add_tail(&blkif->blkdev_list, &io_schedule_list);
+ blkif_get(blkif);
+ }
+ spin_unlock_irqrestore(&io_schedule_list_lock, flags);
+}
+
+
+/******************************************************************
+ * SCHEDULER FUNCTIONS
+ */
+
+static void io_schedule(unsigned long unused)
+{
+ blkif_t *blkif;
+ struct list_head *ent;
+
+ /* Queue up a batch of requests. */
+ while ( (NR_PENDING_REQS < MAX_PENDING_REQS) &&
+ !list_empty(&io_schedule_list) )
+ {
+ ent = io_schedule_list.next;
+ blkif = list_entry(ent, blkif_t, blkdev_list);
+ blkif_get(blkif);
+ remove_from_blkdev_list(blkif);
+ if ( do_block_io_op(blkif, BATCH_PER_DOMAIN) )
+ add_to_blkdev_list_tail(blkif);
+ blkif_put(blkif);
+ }
+
+ /* Push the batch through to disc. */
+ run_task_queue(&tq_disk);
+}
+
+static DECLARE_TASKLET(io_schedule_tasklet, io_schedule, 0);
+
+static void maybe_trigger_io_schedule(void)
+{
+ /*
+     * Needed so that two processes, which together make the following predicate
+ * true, don't both read stale values and evaluate the predicate
+ * incorrectly. Incredibly unlikely to stall the scheduler on x86, but...
+ */
+ smp_mb();
+
+ if ( (NR_PENDING_REQS < (MAX_PENDING_REQS/2)) &&
+ !list_empty(&io_schedule_list) )
+ tasklet_schedule(&io_schedule_tasklet);
+}
+
+
+
+/******************************************************************
+ * COMPLETION CALLBACK -- Called as bh->b_end_io()
+ */
+
+static void __end_block_io_op(pending_req_t *pending_req, int uptodate)
+{
+ unsigned long flags;
+
+ /* An error fails the entire request. */
+ if ( !uptodate )
+ {
+ DPRINTK("Buffer not up-to-date at end of operation\n");
+ pending_req->status = BLKIF_RSP_ERROR;
+ }
+
+ if ( atomic_dec_and_test(&pending_req->pendcnt) )
+ {
+ int pending_idx = pending_req - pending_reqs;
+ fast_flush_area(pending_idx, pending_req->nr_pages);
+ make_response(pending_req->blkif, pending_req->id,
+ pending_req->operation, pending_req->status);
+ blkif_put(pending_req->blkif);
+ spin_lock_irqsave(&pend_prod_lock, flags);
+ pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
+ spin_unlock_irqrestore(&pend_prod_lock, flags);
+ maybe_trigger_io_schedule();
+ }
+}
+
+static void end_block_io_op(struct buffer_head *bh, int uptodate)
+{
+ __end_block_io_op(bh->b_private, uptodate);
+ kmem_cache_free(buffer_head_cachep, bh);
+}
+
+
+
+/******************************************************************************
+ * NOTIFICATION FROM GUEST OS.
+ */
+
+void blkif_be_int(int irq, void *dev_id, struct pt_regs *regs)
+{
+ blkif_t *blkif = dev_id;
+ add_to_blkdev_list_tail(blkif);
+ maybe_trigger_io_schedule();
+}
+
+
+
+/******************************************************************
+ * DOWNWARD CALLS -- These interface with the block-device layer proper.
+ */
+
+static int do_block_io_op(blkif_t *blkif, int max_to_do)
+{
+ blkif_ring_t *blk_ring = blkif->blk_ring_base;
+ blkif_request_t *req;
+ BLK_RING_IDX i;
+ int more_to_do = 0;
+
+ /* Take items off the comms ring, taking care not to overflow. */
+ for ( i = blkif->blk_req_cons;
+ (i != blk_ring->req_prod) && ((i-blkif->blk_resp_prod) !=
+ BLK_RING_SIZE);
+ i++ )
+ {
+ if ( (max_to_do-- == 0) || (NR_PENDING_REQS == MAX_PENDING_REQS) )
+ {
+ more_to_do = 1;
+ break;
+ }
+
+ req = &blk_ring->ring[MASK_BLK_IDX(i)].req;
+ switch ( req->operation )
+ {
+ case BLKIF_OP_READ:
+ case BLKIF_OP_WRITE:
+ dispatch_rw_block_io(blkif, req);
+ break;
+
+ case BLKIF_OP_PROBE:
+ dispatch_probe(blkif, req);
+ break;
+
+ default:
+ DPRINTK("error: unknown block io operation [%d]\n",
+ blk_ring->ring[i].req.operation);
+ make_response(blkif, blk_ring->ring[i].req.id,
+ blk_ring->ring[i].req.operation, BLKIF_RSP_ERROR);
+ break;
+ }
+ }
+
+ blkif->blk_req_cons = i;
+ return more_to_do;
+}
+
+static void dispatch_probe(blkif_t *blkif, blkif_request_t *req)
+{
+ int rsp = BLKIF_RSP_ERROR;
+ int pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)];
+
+ /* We expect one buffer only. */
+ if ( unlikely(req->nr_segments != 1) )
+ goto out;
+
+ /* Make sure the buffer is page-sized. */
+ if ( (blkif_first_sect(req->frame_and_sects[0]) != 0) ||
+ (blkif_last_sect(req->frame_and_sects[0]) != 7) )
+ goto out;
+
+ if ( HYPERVISOR_update_va_mapping_otherdomain(
+ MMAP_VADDR(pending_idx, 0) >> PAGE_SHIFT,
+ (pte_t) { (req->frame_and_sects[0] & PAGE_MASK) | __PAGE_KERNEL },
+ 0, blkif->domid) )
+ goto out;
+
+ rsp = vbd_probe(blkif, (vdisk_t *)MMAP_VADDR(pending_idx, 0),
+ PAGE_SIZE / sizeof(vdisk_t));
+
+ out:
+ fast_flush_area(pending_idx, 1);
+ make_response(blkif, req->id, req->operation, rsp);
+}
+
+static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req)
+{
+ extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]);
+ struct buffer_head *bh;
+ int operation = (req->operation == BLKIF_OP_WRITE) ? WRITE : READ;
+ short nr_sects;
+ unsigned long buffer, fas;
+ int i, tot_sects, pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)];
+ pending_req_t *pending_req;
+ unsigned long remap_prot;
+ multicall_entry_t mcl[MMAP_PAGES_PER_REQUEST];
+
+ /* We map virtual scatter/gather segments to physical segments. */
+ int new_segs, nr_psegs = 0;
+ phys_seg_t phys_seg[BLKIF_MAX_SEGMENTS_PER_REQUEST + 1];
+
+ /* Check that number of segments is sane. */
+ if ( unlikely(req->nr_segments == 0) ||
+ unlikely(req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST) )
+ {
+ DPRINTK("Bad number of segments in request (%d)\n", req->nr_segments);
+ goto bad_descriptor;
+ }
+
+ /*
+ * Check each address/size pair is sane, and convert into a
+ * physical device and block offset. Note that if the offset and size
+ * crosses a virtual extent boundary, we may end up with more
+ * physical scatter/gather segments than virtual segments.
+ */
+ for ( i = tot_sects = 0; i < req->nr_segments; i++, tot_sects += nr_sects )
+ {
+ fas = req->frame_and_sects[i];
+ buffer = (fas & PAGE_MASK) | (blkif_first_sect(fas) << 9);
+ nr_sects = blkif_last_sect(fas) - blkif_first_sect(fas) + 1;
+
+ if ( nr_sects <= 0 )
+ goto bad_descriptor;
+
+ phys_seg[nr_psegs].dev = req->device;
+ phys_seg[nr_psegs].sector_number = req->sector_number + tot_sects;
+ phys_seg[nr_psegs].buffer = buffer;
+ phys_seg[nr_psegs].nr_sects = nr_sects;
+
+ /* Translate the request into the relevant 'physical device' */
+ new_segs = vbd_translate(&phys_seg[nr_psegs], blkif, operation);
+ if ( new_segs < 0 )
+ {
+ DPRINTK("access denied: %s of [%llu,%llu] on dev=%04x\n",
+ operation == READ ? "read" : "write",
+ req->sector_number + tot_sects,
+ req->sector_number + tot_sects + nr_sects,
+ req->device);
+ goto bad_descriptor;
+ }
+
+ nr_psegs += new_segs;
+ ASSERT(nr_psegs <= (BLKIF_MAX_SEGMENTS_PER_REQUEST+1));
+ }
+
+ /* Nonsensical zero-sized request? */
+ if ( unlikely(nr_psegs == 0) )
+ goto bad_descriptor;
+
+ if ( operation == READ )
+ remap_prot = _PAGE_PRESENT|_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW;
+ else
+ remap_prot = _PAGE_PRESENT|_PAGE_DIRTY|_PAGE_ACCESSED;
+
+ for ( i = 0; i < nr_psegs; i++ )
+ {
+ mcl[i].op = __HYPERVISOR_update_va_mapping_otherdomain;
+ mcl[i].args[0] = MMAP_VADDR(pending_idx, i) >> PAGE_SHIFT;
+ mcl[i].args[1] = (phys_seg[i].buffer & PAGE_MASK) | remap_prot;
+ mcl[i].args[2] = 0;
+ mcl[i].args[3] = blkif->domid;
+
+ phys_to_machine_mapping[__pa(MMAP_VADDR(pending_idx, i))>>PAGE_SHIFT] =
+ phys_seg[i].buffer >> PAGE_SHIFT;
+ }
+
+ (void)HYPERVISOR_multicall(mcl, nr_psegs);
+
+ for ( i = 0; i < nr_psegs; i++ )
+ {
+ if ( unlikely(mcl[i].args[5] != 0) )
+ {
+ DPRINTK("invalid buffer -- could not remap it\n");
+ fast_flush_area(pending_idx, nr_psegs);
+ goto bad_descriptor;
+ }
+ }
+
+ pending_req = &pending_reqs[pending_idx];
+ pending_req->blkif = blkif;
+ pending_req->id = req->id;
+ pending_req->operation = operation;
+ pending_req->status = BLKIF_RSP_OKAY;
+ pending_req->nr_pages = nr_psegs;
+ atomic_set(&pending_req->pendcnt, nr_psegs);
+ pending_cons++;
+
+ blkif_get(blkif);
+
+ /* Now we pass each segment down to the real blkdev layer. */
+ for ( i = 0; i < nr_psegs; i++ )
+ {
+ bh = kmem_cache_alloc(buffer_head_cachep, GFP_ATOMIC);
+ if ( unlikely(bh == NULL) )
+ {
+ __end_block_io_op(pending_req, 0);
+ continue;
+ }
+ memset(bh, 0, sizeof (struct buffer_head));
+
+ init_waitqueue_head(&bh->b_wait);
+ bh->b_size = phys_seg[i].nr_sects << 9;
+ bh->b_dev = phys_seg[i].dev;
+ bh->b_rdev = phys_seg[i].dev;
+ bh->b_rsector = (unsigned long)phys_seg[i].sector_number;
+ bh->b_data = (char *)MMAP_VADDR(pending_idx, i) +
+ (phys_seg[i].buffer & ~PAGE_MASK);
+ bh->b_page = virt_to_page(MMAP_VADDR(pending_idx, i));
+ bh->b_end_io = end_block_io_op;
+ bh->b_private = pending_req;
+
+ bh->b_state = (1 << BH_Mapped) | (1 << BH_Lock) |
+ (1 << BH_Req) | (1 << BH_Launder);
+ if ( operation == WRITE )
+ bh->b_state |= (1 << BH_JBD) | (1 << BH_Req) | (1 << BH_Uptodate);
+
+ atomic_set(&bh->b_count, 1);
+
+ /* Dispatch a single request. We'll flush it to disc later. */
+ generic_make_request(operation, bh);
+ }
+
+ return;
+
+ bad_descriptor:
+ make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
+}
+
+
+
+/******************************************************************
+ * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING
+ */
+
+
+static void make_response(blkif_t *blkif, unsigned long id,
+ unsigned short op, int st)
+{
+ blkif_response_t *resp;
+ unsigned long flags;
+
+ /* Place on the response ring for the relevant domain. */
+ spin_lock_irqsave(&blkif->blk_ring_lock, flags);
+ resp = &blkif->blk_ring_base->
+ ring[MASK_BLK_IDX(blkif->blk_resp_prod)].resp;
+ resp->id = id;
+ resp->operation = op;
+ resp->status = st;
+ wmb();
+ blkif->blk_ring_base->resp_prod = ++blkif->blk_resp_prod;
+ spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);
+
+ /* Kick the relevant domain. */
+ notify_via_evtchn(blkif->evtchn);
+}
+
+void blkif_deschedule(blkif_t *blkif)
+{
+ remove_from_blkdev_list(blkif);
+}
+
+static int __init init_module(void)
+{
+ int i;
+
+ if ( !(start_info.flags & SIF_INITDOMAIN)
+ && !(start_info.flags & SIF_BLK_BE_DOMAIN) )
+ return 0;
+
+ blkif_interface_init();
+
+ if ( (mmap_vstart = allocate_empty_lowmem_region(MMAP_PAGES)) == 0 )
+ BUG();
+
+ pending_cons = 0;
+ pending_prod = MAX_PENDING_REQS;
+ memset(pending_reqs, 0, sizeof(pending_reqs));
+ for ( i = 0; i < MAX_PENDING_REQS; i++ )
+ pending_ring[i] = i;
+
+ spin_lock_init(&io_schedule_list_lock);
+ INIT_LIST_HEAD(&io_schedule_list);
+
+ buffer_head_cachep = kmem_cache_create(
+ "buffer_head_cache", sizeof(struct buffer_head),
+ 0, SLAB_HWCACHE_ALIGN, NULL, NULL);
+
+ blkif_ctrlif_init();
+
+ return 0;
+}
+
+static void cleanup_module(void)
+{
+ BUG();
+}
+
+module_init(init_module);
+module_exit(cleanup_module);
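
The pending_ring in main.c is a free-index allocator: the numbers of free
pending_req slots circulate in a power-of-two ring, pending_cons advances on
allocation and pending_prod on release, so completions may hand slots back
in any order. A userspace sketch of that scheme (the kernel version guards
the producer side with pend_prod_lock; names here are illustrative):

    #include <assert.h>
    #include <stdio.h>

    #define MAX_REQS 64                   /* must be a power of two */
    #define MASK(_i) ((_i)&(MAX_REQS-1))

    static unsigned char ring[MAX_REQS];
    static unsigned int prod, cons;       /* free-running indices */

    /* Mirrors NR_PENDING_REQS: slots handed out but not yet returned. */
    #define NR_IN_FLIGHT (MAX_REQS - prod + cons)

    static int alloc_slot(void)
    {
        if ( NR_IN_FLIGHT == MAX_REQS )
            return -1;                    /* every slot is in use */
        return ring[MASK(cons++)];
    }

    static void free_slot(int idx)
    {
        ring[MASK(prod++)] = idx;         /* may return out of order */
    }

    int main(void)
    {
        int i, a, b;

        prod = MAX_REQS;                  /* initially every slot is free */
        for ( i = 0; i < MAX_REQS; i++ )
            ring[i] = i;

        a = alloc_slot();
        b = alloc_slot();
        free_slot(b);                     /* complete out of order */
        free_slot(a);
        assert(NR_IN_FLIGHT == 0);
        printf("slots %d and %d allocated and returned\n", a, b);
        return 0;
    }
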
diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/vbd.c b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/vbd.c
new file mode 100644
index 0000000000..6704fbb541
--- /dev/null
+++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/vbd.c
@@ -0,0 +1,436 @@
+/******************************************************************************
+ * arch/xen/drivers/blkif/backend/vbd.c
+ *
+ * Routines for managing virtual block devices (VBDs).
+ *
+ * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
+ */
+
+#include "common.h"
+
+void vbd_create(blkif_be_vbd_create_t *create)
+{
+ vbd_t *vbd;
+ rb_node_t **rb_p, *rb_parent = NULL;
+ blkif_t *blkif;
+ blkif_vdev_t vdevice = create->vdevice;
+
+ blkif = blkif_find_by_handle(create->domid, create->blkif_handle);
+ if ( unlikely(blkif == NULL) )
+ {
+ DPRINTK("vbd_create attempted for non-existent blkif (%u,%u)\n",
+ create->domid, create->blkif_handle);
+ create->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
+ return;
+ }
+
+ spin_lock(&blkif->vbd_lock);
+
+ rb_p = &blkif->vbd_rb.rb_node;
+ while ( *rb_p != NULL )
+ {
+ rb_parent = *rb_p;
+ vbd = rb_entry(rb_parent, vbd_t, rb);
+ if ( vdevice < vbd->vdevice )
+ {
+ rb_p = &rb_parent->rb_left;
+ }
+ else if ( vdevice > vbd->vdevice )
+ {
+ rb_p = &rb_parent->rb_right;
+ }
+ else
+ {
+ DPRINTK("vbd_create attempted for already existing vbd\n");
+ create->status = BLKIF_BE_STATUS_VBD_EXISTS;
+ goto out;
+ }
+ }
+
+ if ( unlikely((vbd = kmalloc(sizeof(vbd_t), GFP_KERNEL)) == NULL) )
+ {
+ DPRINTK("vbd_create: out of memory\n");
+ create->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
+ goto out;
+ }
+
+ vbd->vdevice = vdevice;
+ vbd->readonly = create->readonly;
+ vbd->type = VDISK_TYPE_DISK | VDISK_FLAG_VIRT;
+ vbd->extents = NULL;
+
+ rb_link_node(&vbd->rb, rb_parent, rb_p);
+ rb_insert_color(&vbd->rb, &blkif->vbd_rb);
+
+ DPRINTK("Successful creation of vdev=%04x (dom=%u)\n",
+ vdevice, create->domid);
+ create->status = BLKIF_BE_STATUS_OKAY;
+
+ out:
+ spin_unlock(&blkif->vbd_lock);
+}
+
+
+/* Grow a VBD by appending a new extent. Fails if the VBD doesn't exist. */
+void vbd_grow(blkif_be_vbd_grow_t *grow)
+{
+ blkif_t *blkif;
+ blkif_extent_le_t **px, *x;
+ vbd_t *vbd = NULL;
+ rb_node_t *rb;
+ blkif_vdev_t vdevice = grow->vdevice;
+
+ blkif = blkif_find_by_handle(grow->domid, grow->blkif_handle);
+ if ( unlikely(blkif == NULL) )
+ {
+ DPRINTK("vbd_grow attempted for non-existent blkif (%u,%u)\n",
+ grow->domid, grow->blkif_handle);
+ grow->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
+ return;
+ }
+
+ spin_lock(&blkif->vbd_lock);
+
+ rb = blkif->vbd_rb.rb_node;
+ while ( rb != NULL )
+ {
+ vbd = rb_entry(rb, vbd_t, rb);
+ if ( vdevice < vbd->vdevice )
+ rb = rb->rb_left;
+ else if ( vdevice > vbd->vdevice )
+ rb = rb->rb_right;
+ else
+ break;
+ }
+
+ if ( unlikely(vbd == NULL) || unlikely(vbd->vdevice != vdevice) )
+ {
+ DPRINTK("vbd_grow: attempted to append extent to non-existent VBD.\n");
+ grow->status = BLKIF_BE_STATUS_VBD_NOT_FOUND;
+ goto out;
+ }
+
+ if ( unlikely((x = kmalloc(sizeof(blkif_extent_le_t),
+ GFP_KERNEL)) == NULL) )
+ {
+ DPRINTK("vbd_grow: out of memory\n");
+ grow->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
+ goto out;
+ }
+
+ x->extent.device = grow->extent.device;
+ x->extent.sector_start = grow->extent.sector_start;
+ x->extent.sector_length = grow->extent.sector_length;
+ x->next = (blkif_extent_le_t *)NULL;
+
+ for ( px = &vbd->extents; *px != NULL; px = &(*px)->next )
+ continue;
+
+ *px = x;
+
+ DPRINTK("Successful grow of vdev=%04x (dom=%u)\n",
+ vdevice, grow->domid);
+ grow->status = BLKIF_BE_STATUS_OKAY;
+
+ out:
+ spin_unlock(&blkif->vbd_lock);
+}
+
+
+void vbd_shrink(blkif_be_vbd_shrink_t *shrink)
+{
+ blkif_t *blkif;
+ blkif_extent_le_t **px, *x;
+ vbd_t *vbd = NULL;
+ rb_node_t *rb;
+ blkif_vdev_t vdevice = shrink->vdevice;
+
+ blkif = blkif_find_by_handle(shrink->domid, shrink->blkif_handle);
+ if ( unlikely(blkif == NULL) )
+ {
+ DPRINTK("vbd_shrink attempted for non-existent blkif (%u,%u)\n",
+ shrink->domid, shrink->blkif_handle);
+ shrink->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
+ return;
+ }
+
+ spin_lock(&blkif->vbd_lock);
+
+ rb = blkif->vbd_rb.rb_node;
+ while ( rb != NULL )
+ {
+ vbd = rb_entry(rb, vbd_t, rb);
+ if ( vdevice < vbd->vdevice )
+ rb = rb->rb_left;
+ else if ( vdevice > vbd->vdevice )
+ rb = rb->rb_right;
+ else
+ break;
+ }
+
+ if ( unlikely(vbd == NULL) || unlikely(vbd->vdevice != vdevice) )
+ {
+ shrink->status = BLKIF_BE_STATUS_VBD_NOT_FOUND;
+ goto out;
+ }
+
+ if ( unlikely(vbd->extents == NULL) )
+ {
+ shrink->status = BLKIF_BE_STATUS_EXTENT_NOT_FOUND;
+ goto out;
+ }
+
+ /* Find the last extent. We now know that there is at least one. */
+ for ( px = &vbd->extents; (*px)->next != NULL; px = &(*px)->next )
+ continue;
+
+ x = *px;
+ *px = x->next;
+ kfree(x);
+
+ shrink->status = BLKIF_BE_STATUS_OKAY;
+
+ out:
+ spin_unlock(&blkif->vbd_lock);
+}
+
+
+void vbd_destroy(blkif_be_vbd_destroy_t *destroy)
+{
+ blkif_t *blkif;
+ vbd_t *vbd;
+ rb_node_t *rb;
+ blkif_extent_le_t *x, *t;
+ blkif_vdev_t vdevice = destroy->vdevice;
+
+ blkif = blkif_find_by_handle(destroy->domid, destroy->blkif_handle);
+ if ( unlikely(blkif == NULL) )
+ {
+ DPRINTK("vbd_destroy attempted for non-existent blkif (%u,%u)\n",
+ destroy->domid, destroy->blkif_handle);
+ destroy->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
+ return;
+ }
+
+ spin_lock(&blkif->vbd_lock);
+
+ rb = blkif->vbd_rb.rb_node;
+ while ( rb != NULL )
+ {
+ vbd = rb_entry(rb, vbd_t, rb);
+ if ( vdevice < vbd->vdevice )
+ rb = rb->rb_left;
+ else if ( vdevice > vbd->vdevice )
+ rb = rb->rb_right;
+ else
+ goto found;
+ }
+
+ destroy->status = BLKIF_BE_STATUS_VBD_NOT_FOUND;
+ goto out;
+
+ found:
+ rb_erase(rb, &blkif->vbd_rb);
+ x = vbd->extents;
+ kfree(vbd);
+
+ while ( x != NULL )
+ {
+ t = x->next;
+ kfree(x);
+ x = t;
+ }
+
+ out:
+ spin_unlock(&blkif->vbd_lock);
+}
+
+
+void destroy_all_vbds(blkif_t *blkif)
+{
+ vbd_t *vbd;
+ rb_node_t *rb;
+ blkif_extent_le_t *x, *t;
+
+ spin_lock(&blkif->vbd_lock);
+
+ while ( (rb = blkif->vbd_rb.rb_node) != NULL )
+ {
+ vbd = rb_entry(rb, vbd_t, rb);
+
+ rb_erase(rb, &blkif->vbd_rb);
+ x = vbd->extents;
+ kfree(vbd);
+
+ while ( x != NULL )
+ {
+ t = x->next;
+ kfree(x);
+ x = t;
+ }
+ }
+
+ spin_unlock(&blkif->vbd_lock);
+}
+
+
+static int vbd_probe_single(blkif_t *blkif, vdisk_t *vbd_info, vbd_t *vbd)
+{
+ blkif_extent_le_t *x;
+
+ vbd_info->device = vbd->vdevice;
+ vbd_info->info = vbd->type;
+ if ( vbd->readonly )
+ vbd_info->info |= VDISK_FLAG_RO;
+ vbd_info->capacity = 0ULL;
+ for ( x = vbd->extents; x != NULL; x = x->next )
+ vbd_info->capacity += x->extent.sector_length;
+
+ return 0;
+}
+
+
+int vbd_probe(blkif_t *blkif, vdisk_t *vbd_info, int max_vbds)
+{
+ int rc = 0, nr_vbds = 0;
+ rb_node_t *rb;
+
+ spin_lock(&blkif->vbd_lock);
+
+ if ( (rb = blkif->vbd_rb.rb_node) == NULL )
+ goto out;
+
+ new_subtree:
+ /* STEP 1. Find least node (it'll be left-most). */
+ while ( rb->rb_left != NULL )
+ rb = rb->rb_left;
+
+ for ( ; ; )
+ {
+ /* STEP 2. Dealt with left subtree. Now process current node. */
+ if ( (rc = vbd_probe_single(blkif, &vbd_info[nr_vbds],
+ rb_entry(rb, vbd_t, rb))) != 0 )
+ goto out;
+ if ( ++nr_vbds == max_vbds )
+ goto out;
+
+ /* STEP 3. Process right subtree, if any. */
+ if ( rb->rb_right != NULL )
+ {
+ rb = rb->rb_right;
+ goto new_subtree;
+ }
+
+        /* STEP 4. Done both subtrees. Head back through ancestors. */
+ for ( ; ; )
+ {
+ /* We're done when we get back to the root node. */
+ if ( rb->rb_parent == NULL )
+ goto out;
+ /* If we are left of parent, then parent is next to process. */
+ if ( rb->rb_parent->rb_left == rb )
+ break;
+ /* If we are right of parent, then we climb to grandparent. */
+ rb = rb->rb_parent;
+ }
+
+ rb = rb->rb_parent;
+ }
+
+ out:
+ spin_unlock(&blkif->vbd_lock);
+ return (rc == 0) ? nr_vbds : rc;
+}
+
+
+int vbd_translate(phys_seg_t *pseg, blkif_t *blkif, int operation)
+{
+ blkif_extent_le_t *x;
+ vbd_t *vbd;
+ rb_node_t *rb;
+ blkif_sector_t sec_off;
+ unsigned long nr_secs;
+
+ spin_lock(&blkif->vbd_lock);
+
+ rb = blkif->vbd_rb.rb_node;
+ while ( rb != NULL )
+ {
+ vbd = rb_entry(rb, vbd_t, rb);
+ if ( pseg->dev < vbd->vdevice )
+ rb = rb->rb_left;
+ else if ( pseg->dev > vbd->vdevice )
+ rb = rb->rb_right;
+ else
+ goto found;
+ }
+
+ DPRINTK("vbd_translate; domain %u attempted to access "
+ "non-existent VBD.\n", blkif->domid);
+
+ spin_unlock(&blkif->vbd_lock);
+ return -ENODEV;
+
+ found:
+
+ if ( (operation == WRITE) && vbd->readonly )
+ {
+ spin_unlock(&blkif->vbd_lock);
+ return -EACCES;
+ }
+
+ /*
+ * Now iterate through the list of blkif_extents, working out which should
+ * be used to perform the translation.
+ */
+ sec_off = pseg->sector_number;
+ nr_secs = pseg->nr_sects;
+ for ( x = vbd->extents; x != NULL; x = x->next )
+ {
+ if ( sec_off < x->extent.sector_length )
+ {
+ pseg->dev = x->extent.device;
+ pseg->sector_number = x->extent.sector_start + sec_off;
+ if ( unlikely((sec_off + nr_secs) > x->extent.sector_length) )
+ goto overrun;
+ spin_unlock(&p->vbd_lock);
+ return 1;
+ }
+ sec_off -= x->extent.sector_length;
+ }
+
+ DPRINTK("vbd_translate: end of vbd.\n");
+ spin_unlock(&blkif->vbd_lock);
+ return -EACCES;
+
+ /*
+ * Here we deal with overrun onto the following extent. We don't deal with
+ * overrun of more than one boundary since each request is restricted to
+ * 2^9 512-byte sectors, so it should be trivial for control software to
+ * ensure that extents are large enough to prevent excessive overrun.
+ */
+ overrun:
+
+ /* Adjust length of first chunk to run to end of first extent. */
+ pseg[0].nr_sects = x->extent.sector_length - sec_off;
+
+ /* Set second chunk buffer and length to start where first chunk ended. */
+ pseg[1].buffer = pseg[0].buffer + (pseg[0].nr_sects << 9);
+ pseg[1].nr_sects = nr_secs - pseg[0].nr_sects;
+
+ /* Now move to the next extent. Check it exists and is long enough! */
+ if ( unlikely((x = x->next) == NULL) ||
+ unlikely(x->extent.sector_length < pseg[1].nr_sects) )
+ {
+ DPRINTK("vbd_translate: multiple overruns or end of vbd.\n");
+ spin_unlock(&p->vbd_lock);
+ return -EACCES;
+ }
+
+ /* Store the real device and start sector for the second chunk. */
+ pseg[1].dev = x->extent.device;
+ pseg[1].sector_number = x->extent.sector_start;
+
+ spin_unlock(&blkif->vbd_lock);
+ return 2;
+}
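
vbd_translate() above resolves a (virtual device, sector, count) triple
against the VBD's extent list, and splits the request into two physical
segments when it overruns exactly one extent boundary. A worked userspace
sketch of that translation, with made-up extent geometry:

    #include <stdio.h>

    struct extent { unsigned dev; unsigned long start, length; };
    struct seg    { unsigned dev; unsigned long sector, nr_sects; };

    /* Returns the number of physical segments (1 or 2), or -1 on error. */
    static int translate(const struct extent *x, int nx,
                         unsigned long sector, unsigned long nr_sects,
                         struct seg seg[2])
    {
        int i;

        for ( i = 0; i < nx; i++ )
        {
            if ( sector < x[i].length )
            {
                seg[0].dev    = x[i].dev;
                seg[0].sector = x[i].start + sector;
                if ( (sector + nr_sects) <= x[i].length )
                {
                    seg[0].nr_sects = nr_sects;
                    return 1;             /* fits within one extent */
                }
                /* Overrun onto the next extent: split into two chunks. */
                seg[0].nr_sects = x[i].length - sector;
                if ( (i + 1 == nx) ||
                     (x[i+1].length < (nr_sects - seg[0].nr_sects)) )
                    return -1;            /* multiple overruns: refuse */
                seg[1].dev      = x[i+1].dev;
                seg[1].sector   = x[i+1].start;
                seg[1].nr_sects = nr_sects - seg[0].nr_sects;
                return 2;
            }
            sector -= x[i].length;        /* skip this extent entirely */
        }
        return -1;                        /* ran off the end of the vbd */
    }

    int main(void)
    {
        struct extent vbd[2] = { { 3, 1000, 100 }, { 4, 5000, 100 } };
        struct seg seg[2];
        int i, n;

        n = translate(vbd, 2, 90, 20, seg);   /* crosses the boundary */
        for ( i = 0; i < n; i++ )
            printf("seg %d: dev=%u sector=%lu nr_sects=%lu\n",
                   i, seg[i].dev, seg[i].sector, seg[i].nr_sects);
        return 0;
    }
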