6 files changed, 1390 insertions, 0 deletions
diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/Makefile b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/Makefile
new file mode 100644
index 0000000000..4c8c17367c
--- /dev/null
+++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/Makefile
@@ -0,0 +1,3 @@
+O_TARGET := drv.o
+obj-y := main.o control.o interface.o vbd.o
+include $(TOPDIR)/Rules.make
diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/common.h b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/common.h
new file mode 100644
index 0000000000..d9f1d22908
--- /dev/null
+++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/common.h
@@ -0,0 +1,108 @@
+/******************************************************************************
+ * arch/xen/drivers/blkif/backend/common.h
+ */
+
+#ifndef __BLKIF__BACKEND__COMMON_H__
+#define __BLKIF__BACKEND__COMMON_H__
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/rbtree.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <asm/ctrl_if.h>
+#include <asm/io.h>
+#include "../blkif.h"
+
+#ifndef NDEBUG
+#define ASSERT(_p) \
+    if ( !(_p) ) { printk("Assertion '%s' failed, line %d, file %s", #_p , \
+    __LINE__, __FILE__); *(int*)0=0; }
+#define DPRINTK(_f, _a...) printk("(file=%s, line=%d) " _f, \
+                           __FILE__ , __LINE__ , ## _a )
+#else
+#define ASSERT(_p) ((void)0)
+#define DPRINTK(_f, _a...) ((void)0)
+#endif
+
+typedef struct blkif_st {
+    /* Unique identifier for this interface. */
+    domid_t          domid;         /* Remote domain id (hash key, part 1). */
+    unsigned int     handle;        /* Interface handle (hash key, part 2). */
+    /* Physical parameters of the comms window. */
+    unsigned long    shmem_frame;   /* Frame number of the shared ring page. */
+    unsigned int     evtchn;        /* Event channel used to kick the peer. */
+    int              irq;           /* Result of bind_evtchn_to_irq(evtchn). */
+    /* Comms information. */
+    blkif_ring_t    *blk_ring_base; /* ioremap()'ed ptr to shmem_frame. */
+    BLK_RING_IDX     blk_req_cons;  /* Request consumer. */
+    BLK_RING_IDX     blk_resp_prod; /* Private version of response producer. */
+    /* VBDs attached to this interface. */
+    rb_root_t        vbd_rb;        /* Mapping from 16-bit vdevices to VBDs. */
+    spinlock_t       vbd_lock;      /* Protects VBD mapping. */
+    /* Private fields. */
+    enum { DISCONNECTED, DISCONNECTING, CONNECTED } status;
+    /*
+     * DISCONNECT response is deferred until pending requests are ack'ed.
+     * We therefore need to store the id from the original request.
+     */
+    u8               disconnect_rspid;
+    struct blkif_st *hash_next;     /* Next interface in the same hash bucket. */
+    struct list_head blkdev_list;   /* Scheduler-list link; next==NULL if unqueued. */
+    spinlock_t       blk_ring_lock; /* Serialises response-ring updates. */
+    atomic_t         refcnt;        /* Managed via blkif_get()/blkif_put(). */
+} blkif_t;
+
+void blkif_create(blkif_be_create_t *create);
+void blkif_destroy(blkif_be_destroy_t *destroy);
+void blkif_connect(blkif_be_connect_t *connect);
+int  blkif_disconnect(blkif_be_disconnect_t *disconnect, u8 rsp_id);
+void __blkif_disconnect_complete(blkif_t *blkif);
+blkif_t *blkif_find_by_handle(domid_t domid, unsigned int handle);
+#define blkif_get(_b) (atomic_inc(&(_b)->refcnt))
+#define blkif_put(_b)                             \
+    do {                                          \
+        if ( atomic_dec_and_test(&(_b)->refcnt) ) \
+            __blkif_disconnect_complete(_b);      \
+    } while (0)
+
+/* An entry in a list of xen_extents. */
+typedef struct _blkif_extent_le { 
+    blkif_extent_t extent;               /* an individual extent */
+    struct _blkif_extent_le *next;       /* and a pointer to the next */ 
+} blkif_extent_le_t; 
+
+typedef struct _vbd { 
+    blkif_vdev_t       vdevice;   /* what the domain refers to this vbd as */
+    unsigned char      readonly;  /* Non-zero -> read-only */
+    unsigned char      type;      /* XD_TYPE_xxx */
+    blkif_extent_le_t *extents;   /* list of xen_extents making up this vbd */
+    rb_node_t          rb;        /* for linking into R-B tree lookup struct */
+} vbd_t; 
+
+void vbd_create(blkif_be_vbd_create_t *create); 
+void vbd_grow(blkif_be_vbd_grow_t *grow); 
+void vbd_shrink(blkif_be_vbd_shrink_t *shrink);
+void vbd_destroy(blkif_be_vbd_destroy_t *delete); 
+int vbd_probe(blkif_t *blkif, vdisk_t *vbd_info, int max_vbds);
+void destroy_all_vbds(blkif_t *blkif);
+
+/* Describes a [partial] disk extent (part of a block io request) */
+typedef struct {
+    unsigned short dev;           /* Device number; copied into bh->b_dev/b_rdev. */
+    unsigned short nr_sects;      /* Length in 512-byte sectors (size = n << 9). */
+    unsigned long  buffer;        /* Data address: frame bits | byte offset. */
+    xen_sector_t   sector_number; /* Start sector; becomes bh->b_rsector. */
+} phys_seg_t;
+
+int vbd_translate(phys_seg_t *pseg, blkif_t *blkif, int operation); 
+
+void blkif_interface_init(void);
+void blkif_ctrlif_init(void);
+
+void blkif_deschedule(blkif_t *blkif);
+
+void blkif_be_int(int irq, void *dev_id, struct pt_regs *regs);
+
+#endif /* __BLKIF__BACKEND__COMMON_H__ */
diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/control.c b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/control.c
new file mode 100644
index 0000000000..0b26224651
--- /dev/null
+++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/control.c
@@ -0,0 +1,87 @@
+/******************************************************************************
+ * arch/xen/drivers/blkif/backend/control.c
+ * 
+ * Routines for interfacing with the control plane.
+ * 
+ * Copyright (c) 2004, Keir Fraser
+ */
+
+#include "common.h"
+
+static void blkif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id)
+{
+    DPRINTK("Received blkif backend message, subtype=%d\n", msg->subtype);
+    /* Each case checks the exact payload size, then runs its handler in place. */
+    switch ( msg->subtype )
+    {
+    case CMSG_BLKIF_BE_CREATE:
+        if ( msg->length != sizeof(blkif_be_create_t) )
+            goto parse_error;
+        blkif_create((blkif_be_create_t *)&msg->msg[0]);
+        break;        
+    case CMSG_BLKIF_BE_DESTROY:
+        if ( msg->length != sizeof(blkif_be_destroy_t) )
+            goto parse_error;
+        blkif_destroy((blkif_be_destroy_t *)&msg->msg[0]);
+        break;        
+    case CMSG_BLKIF_BE_CONNECT:
+        if ( msg->length != sizeof(blkif_be_connect_t) )
+            goto parse_error;
+        blkif_connect((blkif_be_connect_t *)&msg->msg[0]);
+        break;        
+    case CMSG_BLKIF_BE_DISCONNECT:
+        if ( msg->length != sizeof(blkif_be_disconnect_t) )
+            goto parse_error;
+        if ( !blkif_disconnect((blkif_be_disconnect_t *)&msg->msg[0],msg->id) )
+            return; /* Sending the response is deferred until later. */
+        break;        
+    case CMSG_BLKIF_BE_VBD_CREATE:
+        if ( msg->length != sizeof(blkif_be_vbd_create_t) )
+            goto parse_error;
+        vbd_create((blkif_be_vbd_create_t *)&msg->msg[0]);
+        break;
+    case CMSG_BLKIF_BE_VBD_DESTROY:
+        if ( msg->length != sizeof(blkif_be_vbd_destroy_t) )
+            goto parse_error;
+        vbd_destroy((blkif_be_vbd_destroy_t *)&msg->msg[0]);
+        break;
+    case CMSG_BLKIF_BE_VBD_GROW:
+        if ( msg->length != sizeof(blkif_be_vbd_grow_t) )
+            goto parse_error;
+        vbd_grow((blkif_be_vbd_grow_t *)&msg->msg[0]);
+        break;
+    case CMSG_BLKIF_BE_VBD_SHRINK:
+        if ( msg->length != sizeof(blkif_be_vbd_shrink_t) )
+            goto parse_error;
+        vbd_shrink((blkif_be_vbd_shrink_t *)&msg->msg[0]);
+        break;
+    default:
+        goto parse_error;
+    }
+    /* The handler wrote its status into the payload; send the message back. */
+    ctrl_if_send_response(msg);
+    return;
+
+ parse_error:
+    DPRINTK("Parse error while reading message subtype %d, len %d\n",
+            msg->subtype, msg->length);
+    msg->length = 0; /* Bad size or unknown subtype: reply with no payload. */
+    ctrl_if_send_response(msg);
+}
+
+void blkif_ctrlif_init(void)
+{
+    ctrl_msg_t                       note;
+    blkif_be_driver_status_changed_t up;
+
+    /* Start receiving CMSG_BLKIF_BE messages; handler runs in blocking context. */
+    (void)ctrl_if_register_receiver(
+        CMSG_BLKIF_BE, blkif_ctrlif_rx, CALLBACK_IN_BLOCKING_CONTEXT);
+    /* Tell the domain controller that this driver is now up. */
+    up.status    = BLKIF_DRIVER_STATUS_UP;
+    note.type    = CMSG_BLKIF_BE;
+    note.subtype = CMSG_BLKIF_BE_DRIVER_STATUS_CHANGED;
+    note.length  = sizeof(up);
+    memcpy(note.msg, &up, sizeof(up));
+    ctrl_if_send_message_block(&note, NULL, 0, TASK_UNINTERRUPTIBLE);
+}
diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/interface.c b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/interface.c
new file mode 100644
index 0000000000..780d793c6c
--- /dev/null
+++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/interface.c
@@ -0,0 +1,233 @@
+/******************************************************************************
+ * arch/xen/drivers/blkif/backend/interface.c
+ * 
+ * Block-device interface management.
+ * 
+ * Copyright (c) 2004, Keir Fraser
+ */
+
+#include "common.h"
+
+#define BLKIF_HASHSZ 1024
+#define BLKIF_HASH(_d,_h) (((int)(_d)^(int)(_h))&(BLKIF_HASHSZ-1))
+
+static kmem_cache_t *blkif_cachep;
+static blkif_t      *blkif_hash[BLKIF_HASHSZ];
+
+blkif_t *blkif_find_by_handle(domid_t domid, unsigned int handle)
+{
+    blkif_t *cur = blkif_hash[BLKIF_HASH(domid, handle)];
+    for ( ; cur != NULL; cur = cur->hash_next )
+        if ( (cur->domid == domid) && (cur->handle == handle) )
+            break; /* Exact (domid, handle) match. */
+    return cur; /* NULL when the bucket's chain holds no match. */
+}
+
+void __blkif_disconnect_complete(blkif_t *blkif)
+{
+    ctrl_msg_t            cmsg;
+    blkif_be_disconnect_t disc;
+
+    /*
+     * Runs from blkif_put() on the final reference drop. The unbind/unmap
+     * can't be done in blkif_disconnect() itself: requests may still be
+     * outstanding at the disc, and their responses must reach the peer.
+     */
+    unbind_evtchn_from_irq(blkif->evtchn);
+    vfree(blkif->blk_ring_base);
+
+    /* Construct the deferred response message. */
+    cmsg.type         = CMSG_BLKIF_BE;
+    cmsg.subtype      = CMSG_BLKIF_BE_DISCONNECT;
+    cmsg.id           = blkif->disconnect_rspid; /* Saved by blkif_disconnect(). */
+    cmsg.length       = sizeof(blkif_be_disconnect_t);
+    disc.domid        = blkif->domid;
+    disc.blkif_handle = blkif->handle;
+    disc.status       = BLKIF_BE_STATUS_OKAY;
+    memcpy(cmsg.msg, &disc, sizeof(disc));
+
+    /*
+     * Make sure message is constructed /before/ status change, because
+     * after the status change the 'blkif' structure could be deallocated at
+     * any time. Also make sure we send the response /after/ status change,
+     * as otherwise a subsequent CONNECT request could spuriously fail if
+     * another CPU doesn't see the status change yet.
+     */
+    mb();
+    if ( blkif->status != DISCONNECTING )
+        BUG(); /* Only a disconnect in progress may reach zero references. */
+    blkif->status = DISCONNECTED;
+    mb();
+
+    /* Send the successful response. */
+    ctrl_if_send_response(&cmsg);
+}
+
+void blkif_create(blkif_be_create_t *create)
+{
+    domid_t       domid  = create->domid;
+    unsigned int  handle = create->blkif_handle;
+    blkif_t      *fresh, **link;
+
+    fresh = kmem_cache_alloc(blkif_cachep, GFP_KERNEL);
+    if ( fresh == NULL )
+    {
+        DPRINTK("Could not create blkif: out of memory\n");
+        create->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
+        return;
+    }
+    /* Start from an all-zero, DISCONNECTED interface holding no references. */
+    memset(fresh, 0, sizeof(*fresh));
+    fresh->domid  = domid;
+    fresh->handle = handle;
+    fresh->status = DISCONNECTED;
+    spin_lock_init(&fresh->vbd_lock);
+    spin_lock_init(&fresh->blk_ring_lock);
+    atomic_set(&fresh->refcnt, 0);
+
+    /* Walk the bucket to its tail, refusing a duplicate (domid, handle). */
+    link = &blkif_hash[BLKIF_HASH(domid, handle)];
+    for ( ; *link != NULL; link = &(*link)->hash_next )
+    {
+        if ( ((*link)->domid != domid) || ((*link)->handle != handle) )
+            continue;
+        DPRINTK("Could not create blkif: already exists\n");
+        create->status = BLKIF_BE_STATUS_INTERFACE_EXISTS;
+        kmem_cache_free(blkif_cachep, fresh);
+        return;
+    }
+
+    /* 'link' now addresses the chain's terminating slot: append there. */
+    fresh->hash_next = *link;
+    *link = fresh;
+    DPRINTK("Successfully created blkif\n");
+    create->status = BLKIF_BE_STATUS_OKAY;
+}
+
+void blkif_destroy(blkif_be_destroy_t *destroy)
+{
+    domid_t       domid  = destroy->domid;
+    unsigned int  handle = destroy->blkif_handle;
+    blkif_t      *victim, **link;
+
+    /* Locate the chain slot that points at the requested interface. */
+    link = &blkif_hash[BLKIF_HASH(domid, handle)];
+    for ( ; (victim = *link) != NULL; link = &victim->hash_next )
+        if ( (victim->domid == domid) && (victim->handle == handle) )
+            break;
+
+    if ( victim == NULL )
+    {
+        destroy->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
+        return;
+    }
+
+    /* Only a fully DISCONNECTED interface may be torn down. */
+    if ( victim->status != DISCONNECTED )
+    {
+        destroy->status = BLKIF_BE_STATUS_INTERFACE_CONNECTED;
+        return;
+    }
+
+    /* Unlink from the hash, then release its VBDs and the object itself. */
+    *link = victim->hash_next;
+    destroy_all_vbds(victim);
+    kmem_cache_free(blkif_cachep, victim);
+    destroy->status = BLKIF_BE_STATUS_OKAY;
+}
+
+void blkif_connect(blkif_be_connect_t *connect)
+{
+    domid_t       domid  = connect->domid;
+    unsigned int  evtchn = connect->evtchn;
+    unsigned long shmem_frame = connect->shmem_frame;
+    struct vm_struct *vma;
+    pgprot_t      prot;
+    int           error;
+    blkif_t      *blkif;
+
+    blkif = blkif_find_by_handle(domid, connect->blkif_handle);
+    if ( unlikely(blkif == NULL) )
+    {
+        DPRINTK("blkif_connect attempted for non-existent blkif (%u,%u)\n", 
+                connect->domid, connect->blkif_handle); 
+        connect->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
+        return;
+    }
+
+    /* Reserve one page of kernel VA, then map the shared ring frame into it. */
+    if ( (vma = get_vm_area(PAGE_SIZE, VM_IOREMAP)) == NULL )
+    {
+        connect->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
+        return;
+    }
+    prot = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED);
+    error = direct_remap_area_pages(&init_mm, VMALLOC_VMADDR(vma->addr),
+                                    shmem_frame<<PAGE_SHIFT, PAGE_SIZE,
+                                    prot, domid);
+    if ( error != 0 )
+    {
+        connect->status = (error == -ENOMEM) ? BLKIF_BE_STATUS_OUT_OF_MEMORY :
+                          (error == -EFAULT) ? BLKIF_BE_STATUS_MAPPING_ERROR :
+                                               BLKIF_BE_STATUS_ERROR;
+        goto fail_unmap;
+    }
+    if ( blkif->status != DISCONNECTED )
+    {
+        connect->status = BLKIF_BE_STATUS_INTERFACE_CONNECTED;
+        goto fail_unmap;
+    }
+
+    blkif->evtchn        = evtchn;
+    blkif->irq           = bind_evtchn_to_irq(evtchn);
+    blkif->shmem_frame   = shmem_frame;
+    blkif->blk_ring_base = (blkif_ring_t *)vma->addr;
+    blkif->status        = CONNECTED; /* Set first: the handler tests for it. */
+    blkif_get(blkif);
+
+    connect->status = BLKIF_BE_STATUS_OKAY;
+    if ( !request_irq(blkif->irq, blkif_be_int, 0, "blkif-backend", blkif) )
+        return;
+    /* No handler was installed: undo the connect rather than go deaf. */
+    blkif->status = DISCONNECTED;
+    atomic_dec(&blkif->refcnt); /* Not blkif_put(): no disconnect to complete. */
+    unbind_evtchn_from_irq(evtchn);
+    connect->status = BLKIF_BE_STATUS_ERROR;
+ fail_unmap:
+    vfree(vma->addr);
+}
+
+int blkif_disconnect(blkif_be_disconnect_t *disconnect, u8 rsp_id)
+{
+    blkif_t *blkif;
+    /* Returns 1 if the caller must respond now, 0 if the response is deferred. */
+    blkif = blkif_find_by_handle(disconnect->domid, disconnect->blkif_handle);
+    if ( unlikely(blkif == NULL) )
+    {
+        DPRINTK("blkif_disconnect attempted for non-existent blkif"
+                " (%u,%u)\n", disconnect->domid, disconnect->blkif_handle); 
+        disconnect->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
+        return 1; /* Caller will send response error message. */
+    }
+    if ( blkif->status != CONNECTED )
+    {
+        /* Nothing to tear down here, so no deferred response would follow. */
+        disconnect->status = BLKIF_BE_STATUS_OKAY;
+        return 1; /* Caller sends the response immediately. */
+    }
+
+    blkif->status = DISCONNECTING;
+    blkif->disconnect_rspid = rsp_id;
+    wmb(); /* Let other CPUs see the status change. */
+    free_irq(blkif->irq, blkif); /* dev_id must match the request_irq() call. */
+    blkif_deschedule(blkif);
+    blkif_put(blkif); /* Last reference dropped => disconnect completes. */
+    return 0; /* Caller should not send response message. */
+}
+
+void __init blkif_interface_init(void)
+{
+    blkif_cachep = kmem_cache_create("blkif_cache", sizeof(blkif_t), 0, 0, NULL, NULL);
+    if ( blkif_cachep == NULL ) BUG(); /* blkif_create() allocates from this cache. */
+    memset(blkif_hash, 0, sizeof(blkif_hash)); /* Start with every bucket empty. */
+}
diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/main.c b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/main.c
new file mode 100644
index 0000000000..803af976d2
--- /dev/null
+++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/main.c
@@ -0,0 +1,523 @@
+/******************************************************************************
+ * arch/xen/drivers/blkif/backend/main.c
+ * 
+ * Back-end of the driver for virtual block devices. This portion of the
+ * driver exports a 'unified' block-device interface that can be accessed
+ * by any operating system that implements a compatible front end. A 
+ * reference front-end implementation can be found in:
+ *  arch/xen/drivers/blkif/frontend
+ * 
+ * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
+ */
+
+#include "common.h"
+
+/*
+ * These are rather arbitrary. They are fairly large because adjacent requests
+ * pulled from a communication ring are quite likely to end up being part of
+ * the same scatter/gather request at the disc.
+ * 
+ * ** TRY INCREASING 'MAX_PENDING_REQS' IF WRITE SPEEDS SEEM TOO LOW **
+ * This will increase the chances of being able to write whole tracks.
+ * 64 should be enough to keep us competitive with Linux.
+ */
+#define MAX_PENDING_REQS 64
+#define BATCH_PER_DOMAIN 16
+
+/*
+ * NB. We place a page of padding between each buffer page to avoid incorrect
+ * merging of requests by the IDE and SCSI merging routines. Otherwise, two
+ * adjacent buffers in a scatter-gather request would have adjacent page
+ * numbers: since the merge routines don't realise that this is in *pseudophys*
+ * space, not real space, they may collapse the s-g elements!
+ */
+static unsigned long mmap_vstart;
+#define MMAP_PAGES_PER_REQUEST \
+    (2 * (BLKIF_MAX_SEGMENTS_PER_REQUEST + 1))
+#define MMAP_PAGES             \
+    (MAX_PENDING_REQS * MMAP_PAGES_PER_REQUEST)
+#define MMAP_VADDR(_req,_seg)                        \
+    (mmap_vstart +                                   \
+     ((_req) * MMAP_PAGES_PER_REQUEST * PAGE_SIZE) + \
+     ((_seg) * 2 * PAGE_SIZE))
+
+/*
+ * Each outstanding request that we've passed to the lower device layers has a 
+ * 'pending_req' allocated to it. Each buffer_head that completes decrements 
+ * the pendcnt towards zero. When it hits zero, the specified domain has a 
+ * response queued for it, with the saved 'id' passed back.
+ */
+typedef struct {
+    blkif_t       *blkif;     /* Originating interface; a reference is held. */
+    unsigned long  id;        /* Request id, echoed back in the response. */
+    int            nr_pages;  /* Mapped pages to flush when the request ends. */
+    atomic_t       pendcnt;   /* Segments still awaiting completion. */
+    unsigned short operation; /* Operation code reported by make_response(). */
+    int            status;    /* BLKIF_RSP_OKAY until any segment fails. */
+} pending_req_t;
+
+/*
+ * We can't allocate pending_req's in order, since they may complete out of 
+ * order. We therefore maintain an allocation ring. This ring also indicates 
+ * when enough work has been passed down -- at that point the allocation ring 
+ * will be empty.
+ */
+static pending_req_t pending_reqs[MAX_PENDING_REQS];
+static unsigned char pending_ring[MAX_PENDING_REQS];
+static spinlock_t pend_prod_lock = SPIN_LOCK_UNLOCKED;
+/* NB. We use a different index type to differentiate from shared blk rings. */
+typedef unsigned int PEND_RING_IDX;
+#define MASK_PEND_IDX(_i) ((_i)&(MAX_PENDING_REQS-1))
+static PEND_RING_IDX pending_prod, pending_cons;
+#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)
+
+static kmem_cache_t *buffer_head_cachep;
+
+static int do_block_io_op(blkif_t *blkif, int max_to_do);
+static void dispatch_probe(blkif_t *blkif, blkif_request_t *req);
+static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req);
+static void make_response(blkif_t *blkif, unsigned long id, 
+                          unsigned short op, int st);
+
+static void fast_flush_area(int idx, int nr_pages)
+{
+    multicall_entry_t mcl[MMAP_PAGES_PER_REQUEST], *ent = mcl;
+    int               seg;
+    /* Queue one update_va_mapping call per mapped page of request slot 'idx'. */
+    for ( seg = 0; seg < nr_pages; seg++, ent++ )
+    {
+        ent->op      = __HYPERVISOR_update_va_mapping;
+        ent->args[0] = MMAP_VADDR(idx, seg) >> PAGE_SHIFT;
+        ent->args[1] = 0;
+        ent->args[2] = 0;
+    }
+    /* Request a TLB flush on the final call only, then issue the batch. */
+    mcl[nr_pages-1].args[2] = UVMF_FLUSH_TLB;
+    (void)HYPERVISOR_multicall(mcl, nr_pages);
+}
+
+
+/******************************************************************
+ * BLOCK-DEVICE SCHEDULER LIST MAINTENANCE
+ */
+
+static struct list_head io_schedule_list;
+static spinlock_t io_schedule_list_lock;
+
+static int __on_blkdev_list(blkif_t *blkif)
+{
+    return (blkif->blkdev_list.next == NULL) ? 0 : 1; /* NULL next: unqueued. */
+}
+
+static void remove_from_blkdev_list(blkif_t *blkif)
+{
+    unsigned long irq_flags;
+    if ( blkif->blkdev_list.next == NULL ) return; /* Unlocked peek: unqueued. */
+    spin_lock_irqsave(&io_schedule_list_lock, irq_flags);
+    if ( blkif->blkdev_list.next != NULL ) /* Re-check now the lock is held. */
+    {
+        list_del(&blkif->blkdev_list);
+        blkif->blkdev_list.next = NULL;    /* Mark as no longer queued. */
+        blkif_put(blkif);                  /* Drop the list's reference. */
+    }
+    spin_unlock_irqrestore(&io_schedule_list_lock, irq_flags);
+}
+
+static void add_to_blkdev_list_tail(blkif_t *blkif)
+{
+    unsigned long irq_flags;
+    if ( blkif->blkdev_list.next != NULL ) return; /* Unlocked peek: queued. */
+    spin_lock_irqsave(&io_schedule_list_lock, irq_flags);
+    if ( (blkif->blkdev_list.next == NULL) && (blkif->status == CONNECTED) )
+    {   /* Still unqueued under the lock, and connected: queue + take a ref. */
+        list_add_tail(&blkif->blkdev_list, &io_schedule_list);
+        blkif_get(blkif);
+    }
+    spin_unlock_irqrestore(&io_schedule_list_lock, irq_flags);
+}
+
+
+/******************************************************************
+ * SCHEDULER FUNCTIONS
+ */
+
+static void io_schedule(unsigned long unused)
+{
+    blkif_t          *blkif;
+    struct list_head *ent;
+    /* Take the head interface, service one batch, requeue at the tail if busy. */
+    /* Queue up a batch of requests. */
+    while ( (NR_PENDING_REQS < MAX_PENDING_REQS) &&
+            !list_empty(&io_schedule_list) )
+    {
+        ent = io_schedule_list.next;
+        blkif = list_entry(ent, blkif_t, blkdev_list);
+        blkif_get(blkif); /* Hold a ref: removal drops the list's own. */
+        remove_from_blkdev_list(blkif);
+        if ( do_block_io_op(blkif, BATCH_PER_DOMAIN) )
+            add_to_blkdev_list_tail(blkif); /* More requests remain. */
+        blkif_put(blkif);
+    }
+
+    /* Push the batch through to disc. */
+    run_task_queue(&tq_disk);
+}
+
+static DECLARE_TASKLET(io_schedule_tasklet, io_schedule, 0);
+
+static void maybe_trigger_io_schedule(void)
+{
+    /*
+     * Full barrier before sampling the pending indexes and the list: two
+     * contexts that together make the test below true must not both read
+     * stale values and both skip the kick. Very unlikely on x86, but...
+     */
+    smp_mb();
+
+    if ( NR_PENDING_REQS >= (MAX_PENDING_REQS/2) ) return;
+    if ( !list_empty(&io_schedule_list) )
+        tasklet_schedule(&io_schedule_tasklet);
+}
+
+
+
+/******************************************************************
+ * COMPLETION CALLBACK -- Called as bh->b_end_io()
+ */
+
+static void __end_block_io_op(pending_req_t *pending_req, int uptodate)
+{
+    unsigned long flags;
+
+    /* An error fails the entire request. */
+    if ( !uptodate )
+    {
+        DPRINTK("Buffer not up-to-date at end of operation\n");
+        pending_req->status = BLKIF_RSP_ERROR;
+    }
+    /* The last segment to complete responds and recycles the request slot. */
+    if ( atomic_dec_and_test(&pending_req->pendcnt) )
+    {
+        int pending_idx = pending_req - pending_reqs; /* Slot's array index. */
+        fast_flush_area(pending_idx, pending_req->nr_pages);
+        make_response(pending_req->blkif, pending_req->id,
+                      pending_req->operation, pending_req->status);
+        blkif_put(pending_req->blkif); /* Pairs with dispatch_rw_block_io(). */
+        spin_lock_irqsave(&pend_prod_lock, flags);
+        pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
+        spin_unlock_irqrestore(&pend_prod_lock, flags);
+        maybe_trigger_io_schedule(); /* A slot was freed: maybe resume work. */
+    }
+}
+
+static void end_block_io_op(struct buffer_head *bh, int uptodate)
+{
+    __end_block_io_op(bh->b_private, uptodate); /* b_private: the pending_req. */
+    kmem_cache_free(buffer_head_cachep, bh);    /* bh came from this cache. */
+}
+
+
+
+/******************************************************************************
+ * NOTIFICATION FROM GUEST OS.
+ */
+
+void blkif_be_int(int irq, void *dev_id, struct pt_regs *regs)
+{
+    /* dev_id is the blkif_t that was handed to request_irq() at connect time. */
+    add_to_blkdev_list_tail((blkif_t *)dev_id);
+    maybe_trigger_io_schedule();
+}
+
+
+
+/******************************************************************
+ * DOWNWARD CALLS -- These interface with the block-device layer proper.
+ */
+
+static int do_block_io_op(blkif_t *blkif, int max_to_do)
+{
+    blkif_ring_t *blk_ring = blkif->blk_ring_base;
+    blkif_request_t *req;
+    BLK_RING_IDX i;
+    int more_to_do = 0;
+    /* Take items off the comms ring, taking care not to overflow. Returns 1 */
+    /* if we stopped early (batch quota or pending slots used up), else 0.  */
+    for ( i = blkif->blk_req_cons; 
+          (i != blk_ring->req_prod) && ((i-blkif->blk_resp_prod) != 
+                                        BLK_RING_SIZE);
+          i++ )
+    {
+        if ( (max_to_do-- == 0) || (NR_PENDING_REQS == MAX_PENDING_REQS) )
+        {
+            more_to_do = 1;
+            break;
+        }
+        
+        req = &blk_ring->ring[MASK_BLK_IDX(i)].req;
+        switch ( req->operation )
+        {
+        case BLKIF_OP_READ:
+        case BLKIF_OP_WRITE:
+            dispatch_rw_block_io(blkif, req);
+            break;
+
+        case BLKIF_OP_PROBE:
+            dispatch_probe(blkif, req);
+            break;
+
+        default:
+            /* NB. 'i' is free-running: go via 'req', never ring[i] directly. */
+            DPRINTK("error: unknown block io operation [%d]\n",
+                    req->operation);
+            make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
+            break;
+        }
+    }
+
+    blkif->blk_req_cons = i;
+    return more_to_do;
+}
+
+static void dispatch_probe(blkif_t *blkif, blkif_request_t *req)
+{
+    int rsp = BLKIF_RSP_ERROR;
+    int pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)];
+    /* NB. The free slot is only borrowed for its mapping area, not consumed. */
+    /* We expect one buffer only. */
+    if ( unlikely(req->nr_segments != 1) )
+        goto out;
+
+    /* Make sure the buffer is page-sized. */
+    if ( (blkif_first_sect(req->frame_and_sects[0]) != 0) ||
+         (blkif_last_sect(req->frame_and_sects[0]) != 7) )
+        goto out;
+    /* Map the requester's buffer page at this slot's first mapping address. */
+    if ( HYPERVISOR_update_va_mapping_otherdomain(
+        MMAP_VADDR(pending_idx, 0) >> PAGE_SHIFT,
+        (pte_t) { (req->frame_and_sects[0] & PAGE_MASK) | __PAGE_KERNEL },
+        0, blkif->domid) )
+        goto out;
+    /* vbd_probe()'s return value is passed back as the response status. */
+    rsp = vbd_probe(blkif, (vdisk_t *)MMAP_VADDR(pending_idx, 0), 
+                    PAGE_SIZE / sizeof(vdisk_t));
+
+ out:
+    fast_flush_area(pending_idx, 1); /* Runs on every path, mapped or not. */
+    make_response(blkif, req->id, req->operation, rsp);
+}
+
+static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req)
+{
+    extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]); 
+    struct buffer_head *bh;
+    int operation = (req->operation == BLKIF_OP_WRITE) ? WRITE : READ;
+    short nr_sects;
+    unsigned long buffer, fas;
+    int i, tot_sects, pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)];
+    pending_req_t *pending_req;
+    unsigned long  remap_prot;
+    multicall_entry_t mcl[MMAP_PAGES_PER_REQUEST];
+
+    /* We map virtual scatter/gather segments to physical segments. */
+    int new_segs, nr_psegs = 0;
+    phys_seg_t phys_seg[BLKIF_MAX_SEGMENTS_PER_REQUEST + 1];
+
+    /* Check that number of segments is sane. */
+    if ( unlikely(req->nr_segments == 0) || 
+         unlikely(req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST) )
+    {
+        DPRINTK("Bad number of segments in request (%d)\n", req->nr_segments);
+        goto bad_descriptor;
+    }
+
+    /*
+     * Check each address/size pair is sane, and convert into a
+     * physical device and block offset. Note that if the offset and size
+     * crosses a virtual extent boundary, we may end up with more
+     * physical scatter/gather segments than virtual segments.
+     */
+    for ( i = tot_sects = 0; i < req->nr_segments; i++, tot_sects += nr_sects )
+    {
+        fas      = req->frame_and_sects[i];
+        buffer   = (fas & PAGE_MASK) | (blkif_first_sect(fas) << 9);
+        nr_sects = blkif_last_sect(fas) - blkif_first_sect(fas) + 1;
+
+        if ( nr_sects <= 0 )
+            goto bad_descriptor;
+
+        phys_seg[nr_psegs].dev           = req->device;
+        phys_seg[nr_psegs].sector_number = req->sector_number + tot_sects;
+        phys_seg[nr_psegs].buffer        = buffer;
+        phys_seg[nr_psegs].nr_sects      = nr_sects;
+
+        /* Translate the request into the relevant 'physical device' */
+        new_segs = vbd_translate(&phys_seg[nr_psegs], blkif, operation);
+        if ( new_segs < 0 )
+        { 
+            DPRINTK("access denied: %s of [%llu,%llu] on dev=%04x\n", 
+                    operation == READ ? "read" : "write", 
+                    req->sector_number + tot_sects, 
+                    req->sector_number + tot_sects + nr_sects, 
+                    req->device); 
+            goto bad_descriptor;
+        }
+        /* NOTE(review): this bound is checked only after translation; confirm. */
+        nr_psegs += new_segs;
+        ASSERT(nr_psegs <= (BLKIF_MAX_SEGMENTS_PER_REQUEST+1));
+    }
+
+    /* Nonsensical zero-sized request? */
+    if ( unlikely(nr_psegs == 0) )
+        goto bad_descriptor;
+    /* A disk READ stores into the mapped page, so only it gets _PAGE_RW. */
+    if ( operation == READ )
+        remap_prot = _PAGE_PRESENT|_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW;
+    else
+        remap_prot = _PAGE_PRESENT|_PAGE_DIRTY|_PAGE_ACCESSED;
+    /* Build one foreign-page mapping call per physical segment. */
+    for ( i = 0; i < nr_psegs; i++ )
+    {
+        mcl[i].op = __HYPERVISOR_update_va_mapping_otherdomain;
+        mcl[i].args[0] = MMAP_VADDR(pending_idx, i) >> PAGE_SHIFT;
+        mcl[i].args[1] = (phys_seg[i].buffer & PAGE_MASK) | remap_prot;
+        mcl[i].args[2] = 0;
+        mcl[i].args[3] = blkif->domid;
+
+        phys_to_machine_mapping[__pa(MMAP_VADDR(pending_idx, i))>>PAGE_SHIFT] =
+            phys_seg[i].buffer >> PAGE_SHIFT;
+    }
+
+    (void)HYPERVISOR_multicall(mcl, nr_psegs);
+    /* args[5] is treated here as each call's result: non-zero means failure. */
+    for ( i = 0; i < nr_psegs; i++ )
+    {
+        if ( unlikely(mcl[i].args[5] != 0) )
+        {
+            DPRINTK("invalid buffer -- could not remap it\n");
+            fast_flush_area(pending_idx, nr_psegs);
+            goto bad_descriptor;
+        }
+    }
+    /* Claim the pending slot; pendcnt counts the segments still to complete. */
+    pending_req = &pending_reqs[pending_idx];
+    pending_req->blkif     = blkif;
+    pending_req->id        = req->id;
+    pending_req->operation = operation;
+    pending_req->status    = BLKIF_RSP_OKAY;
+    pending_req->nr_pages  = nr_psegs;
+    atomic_set(&pending_req->pendcnt, nr_psegs);
+    pending_cons++;
+    /* This reference is dropped in __end_block_io_op() on final completion. */
+    blkif_get(blkif);
+
+    /* Now we pass each segment down to the real blkdev layer. */
+    for ( i = 0; i < nr_psegs; i++ )
+    {
+        bh = kmem_cache_alloc(buffer_head_cachep, GFP_ATOMIC);
+        if ( unlikely(bh == NULL) )
+        {
+            __end_block_io_op(pending_req, 0); /* Count it as a failed segment. */
+            continue;
+        }
+        memset(bh, 0, sizeof (struct buffer_head));
+
+        init_waitqueue_head(&bh->b_wait);
+        bh->b_size          = phys_seg[i].nr_sects << 9;
+        bh->b_dev           = phys_seg[i].dev;
+        bh->b_rdev          = phys_seg[i].dev;
+        bh->b_rsector       = (unsigned long)phys_seg[i].sector_number;
+        bh->b_data          = (char *)MMAP_VADDR(pending_idx, i) +
+            (phys_seg[i].buffer & ~PAGE_MASK);
+        bh->b_page          = virt_to_page(MMAP_VADDR(pending_idx, i));
+        bh->b_end_io        = end_block_io_op;
+        bh->b_private       = pending_req;
+
+        bh->b_state = (1 << BH_Mapped) | (1 << BH_Lock) | 
+            (1 << BH_Req) | (1 << BH_Launder);
+        if ( operation == WRITE )
+            bh->b_state |= (1 << BH_JBD) | (1 << BH_Req) | (1 << BH_Uptodate);
+
+        atomic_set(&bh->b_count, 1);
+
+        /* Dispatch a single request. We'll flush it to disc later. */
+        generic_make_request(operation, bh);
+    }
+
+    return;
+
+ bad_descriptor:
+    make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
+} 
+
+
+
+/******************************************************************
+ * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING
+ */
+
+
+static void make_response(blkif_t *blkif, unsigned long id, 
+                          unsigned short op, int st)
+{
+    blkif_response_t *resp;
+    unsigned long     flags;
+    /* Publish one (id, op, st) response on blkif's ring, then notify the peer. */
+    /* Place on the response ring for the relevant domain. */ 
+    spin_lock_irqsave(&blkif->blk_ring_lock, flags);
+    resp = &blkif->blk_ring_base->
+        ring[MASK_BLK_IDX(blkif->blk_resp_prod)].resp;
+    resp->id        = id;
+    resp->operation = op;
+    resp->status    = st;
+    wmb(); /* Order the response body before the producer-index update. */
+    blkif->blk_ring_base->resp_prod = ++blkif->blk_resp_prod;
+    spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);
+
+    /* Kick the relevant domain. */
+    notify_via_evtchn(blkif->evtchn);
+}
+
+void blkif_deschedule(blkif_t *blkif)
+{
+    remove_from_blkdev_list(blkif); /* Exported wrapper for the static helper. */
+}
+
+static int __init init_module(void)
+{
+    int i;
+    /* Do nothing unless start_info flags this domain as init or blk-backend. */
+    if ( !(start_info.flags & SIF_INITDOMAIN)
+         && !(start_info.flags & SIF_BLK_BE_DOMAIN) )
+        return 0;
+
+    blkif_interface_init();
+
+    if ( (mmap_vstart = allocate_empty_lowmem_region(MMAP_PAGES)) == 0 )
+        BUG();
+    /* Every pending_req slot starts free: the ring lists indexes 0..MAX-1. */
+    pending_cons = 0;
+    pending_prod = MAX_PENDING_REQS;
+    memset(pending_reqs, 0, sizeof(pending_reqs));
+    for ( i = 0; i < MAX_PENDING_REQS; i++ )
+        pending_ring[i] = i;
+
+    spin_lock_init(&io_schedule_list_lock);
+    INIT_LIST_HEAD(&io_schedule_list);
+
+    buffer_head_cachep = kmem_cache_create(
+        "buffer_head_cache", sizeof(struct buffer_head),
+        0, SLAB_HWCACHE_ALIGN, NULL, NULL);
+    if ( buffer_head_cachep == NULL )
+        BUG(); /* dispatch_rw_block_io() allocates every bh from this cache. */
+    blkif_ctrlif_init(); /* Report driver-up only once the state above is set. */
+    return 0;
+}
+
+static void cleanup_module(void)
+{
+    BUG(); /* No teardown path is implemented: reaching the exit hook is fatal. */
+}
+
+module_init(init_module);
+module_exit(cleanup_module);
diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/vbd.c b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/vbd.c
new file mode 100644
index 0000000000..6704fbb541
--- /dev/null
+++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/vbd.c
@@ -0,0 +1,436 @@
+/******************************************************************************
+ * arch/xen/drivers/blkif/backend/vbd.c
+ * 
+ * Routines for managing virtual block devices (VBDs).
+ * 
+ * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
+ */
+
+#include "common.h"
+
+/* Add an empty VBD to an interface; outcome is written to create->status. */
+void vbd_create(blkif_be_vbd_create_t *create)
+{
+    vbd_t       *vbd, *new_vbd;
+    rb_node_t  **rb_p, *rb_parent = NULL;
+    blkif_t     *blkif;
+    blkif_vdev_t vdevice = create->vdevice;
+
+    blkif = blkif_find_by_handle(create->domid, create->blkif_handle);
+    if ( unlikely(blkif == NULL) )
+    {
+        DPRINTK("vbd_create attempted for non-existent blkif (%u,%u)\n",
+                create->domid, create->blkif_handle);
+        create->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
+        return;
+    }
+
+    /* GFP_KERNEL may sleep, so allocate before taking the spinlock. */
+    if ( unlikely((new_vbd = kmalloc(sizeof(vbd_t), GFP_KERNEL)) == NULL) )
+    {
+        DPRINTK("vbd_create: out of memory\n");
+        create->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
+        return;
+    }
+
+    new_vbd->vdevice  = vdevice;
+    new_vbd->readonly = create->readonly;
+    new_vbd->type     = VDISK_TYPE_DISK | VDISK_FLAG_VIRT;
+    new_vbd->extents  = NULL;
+
+    spin_lock(&blkif->vbd_lock);
+
+    /* Descend to the empty slot where 'vdevice' belongs in the rb-tree. */
+    rb_p = &blkif->vbd_rb.rb_node;
+    while ( *rb_p != NULL )
+    {
+        rb_parent = *rb_p;
+        vbd = rb_entry(rb_parent, vbd_t, rb);
+        if ( vdevice < vbd->vdevice )
+            rb_p = &rb_parent->rb_left;
+        else if ( vdevice > vbd->vdevice )
+            rb_p = &rb_parent->rb_right;
+        else
+            goto exists;
+    }
+
+    rb_link_node(&new_vbd->rb, rb_parent, rb_p);
+    rb_insert_color(&new_vbd->rb, &blkif->vbd_rb);
+    spin_unlock(&blkif->vbd_lock);
+
+    DPRINTK("Successful creation of vdev=%04x (dom=%u)\n",
+            vdevice, create->domid);
+    create->status = BLKIF_BE_STATUS_OKAY;
+    return;
+
+ exists:
+    spin_unlock(&blkif->vbd_lock);
+    kfree(new_vbd);
+    DPRINTK("vbd_create attempted for already existing vbd\n");
+    create->status = BLKIF_BE_STATUS_VBD_EXISTS;
+}
+
+
+/* Grow a VBD by appending a new extent. Fails if the VBD doesn't exist.
+ * The outcome is written to grow->status. */
+void vbd_grow(blkif_be_vbd_grow_t *grow)
+{
+    blkif_t            *blkif;
+    blkif_extent_le_t **px, *x;
+    vbd_t              *vbd = NULL;
+    rb_node_t          *rb;
+    blkif_vdev_t        vdevice = grow->vdevice;
+
+    blkif = blkif_find_by_handle(grow->domid, grow->blkif_handle);
+    if ( unlikely(blkif == NULL) )
+    {
+        DPRINTK("vbd_grow attempted for non-existent blkif (%u,%u)\n",
+                grow->domid, grow->blkif_handle);
+        grow->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
+        return;
+    }
+
+    /* GFP_KERNEL may sleep, so allocate before taking the spinlock. */
+    if ( unlikely((x = kmalloc(sizeof(*x), GFP_KERNEL)) == NULL) )
+    {
+        DPRINTK("vbd_grow: out of memory\n");
+        grow->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
+        return;
+    }
+
+    x->extent.device        = grow->extent.device;
+    x->extent.sector_start  = grow->extent.sector_start;
+    x->extent.sector_length = grow->extent.sector_length;
+    x->next                 = (blkif_extent_le_t *)NULL;
+
+    spin_lock(&blkif->vbd_lock);
+
+    rb = blkif->vbd_rb.rb_node;
+    while ( rb != NULL )
+    {
+        vbd = rb_entry(rb, vbd_t, rb);
+        if ( vdevice < vbd->vdevice )
+            rb = rb->rb_left;
+        else if ( vdevice > vbd->vdevice )
+            rb = rb->rb_right;
+        else
+            break;
+    }
+
+    if ( unlikely(rb == NULL) )
+    {
+        spin_unlock(&blkif->vbd_lock);
+        kfree(x);
+        DPRINTK("vbd_grow: attempted to append extent to non-existent VBD.\n");
+        grow->status = BLKIF_BE_STATUS_VBD_NOT_FOUND;
+        return;
+    }
+
+    for ( px = &vbd->extents; *px != NULL; px = &(*px)->next )
+        continue;
+    *px = x;
+    spin_unlock(&blkif->vbd_lock);
+
+    DPRINTK("Successful grow of vdev=%04x (dom=%u)\n",
+            vdevice, grow->domid);
+    grow->status = BLKIF_BE_STATUS_OKAY;
+}
+
+
+/*
+ * Shrink a VBD by unlinking and freeing its last extent. The outcome is
+ * written to shrink->status: INTERFACE_NOT_FOUND, VBD_NOT_FOUND,
+ * EXTENT_NOT_FOUND (the VBD has no extents) or OKAY.
+ */
+void vbd_shrink(blkif_be_vbd_shrink_t *shrink)
+{
+    blkif_t            *blkif;
+    blkif_extent_le_t **link, *last;
+    vbd_t              *cur = NULL;
+    rb_node_t          *node;
+    blkif_vdev_t        vdevice = shrink->vdevice;
+
+    blkif = blkif_find_by_handle(shrink->domid, shrink->blkif_handle);
+    if ( unlikely(blkif == NULL) )
+    {
+        DPRINTK("vbd_shrink attempted for non-existent blkif (%u,%u)\n",
+                shrink->domid, shrink->blkif_handle);
+        shrink->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
+        return;
+    }
+
+    spin_lock(&blkif->vbd_lock);
+
+    /* Search the rb-tree for 'vdevice'; node ends up NULL if it is absent. */
+    for ( node = blkif->vbd_rb.rb_node; node != NULL; )
+    {
+        cur = rb_entry(node, vbd_t, rb);
+        if ( vdevice == cur->vdevice )
+            break;
+        node = (vdevice < cur->vdevice) ? node->rb_left : node->rb_right;
+    }
+
+    if ( unlikely(node == NULL) )
+    {
+        shrink->status = BLKIF_BE_STATUS_VBD_NOT_FOUND;
+    }
+    else if ( unlikely(cur->extents == NULL) )
+    {
+        shrink->status = BLKIF_BE_STATUS_EXTENT_NOT_FOUND;
+    }
+    else
+    {
+        /* Advance to the link that points at the final list element. */
+        link = &cur->extents;
+        while ( (*link)->next != NULL )
+            link = &(*link)->next;
+
+        last  = *link;
+        *link = last->next;
+        kfree(last);
+        shrink->status = BLKIF_BE_STATUS_OKAY;
+    }
+
+    spin_unlock(&blkif->vbd_lock);
+}
+
+
+/* Unlink VBD 'destroy->vdevice' from its interface; free it and its extents.
+ * The outcome is written to destroy->status. */
+void vbd_destroy(blkif_be_vbd_destroy_t *destroy)
+{
+    blkif_t           *blkif;
+    vbd_t             *vbd = NULL;
+    rb_node_t         *rb;
+    blkif_extent_le_t *x, *t;
+    blkif_vdev_t       vdevice = destroy->vdevice;
+
+    blkif = blkif_find_by_handle(destroy->domid, destroy->blkif_handle);
+    if ( unlikely(blkif == NULL) )
+    {
+        DPRINTK("vbd_destroy attempted for non-existent blkif (%u,%u)\n",
+                destroy->domid, destroy->blkif_handle);
+        destroy->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
+        return;
+    }
+
+    spin_lock(&blkif->vbd_lock);
+
+    rb = blkif->vbd_rb.rb_node;
+    while ( rb != NULL )
+    {
+        vbd = rb_entry(rb, vbd_t, rb);
+        if ( vdevice < vbd->vdevice )
+            rb = rb->rb_left;
+        else if ( vdevice > vbd->vdevice )
+            rb = rb->rb_right;
+        else
+            goto found;
+    }
+
+    destroy->status = BLKIF_BE_STATUS_VBD_NOT_FOUND;
+    goto out;
+
+ found:
+    rb_erase(rb, &blkif->vbd_rb);
+    for ( x = vbd->extents; x != NULL; x = t )
+    {
+        t = x->next;
+        kfree(x);
+    }
+    kfree(vbd);
+    destroy->status = BLKIF_BE_STATUS_OKAY; /* was left unset on success */
+
+ out:
+    spin_unlock(&blkif->vbd_lock);
+}
+
+
+/* Free every VBD of blkif, and each VBD's extent list, under vbd_lock. */
+void destroy_all_vbds(blkif_t *blkif)
+{
+    rb_node_t         *root;
+    vbd_t             *victim;
+    blkif_extent_le_t *ext, *next_ext;
+
+    spin_lock(&blkif->vbd_lock);
+
+    /* Erase whichever node is currently the root until the tree is empty. */
+    for ( root = blkif->vbd_rb.rb_node; root != NULL;
+          root = blkif->vbd_rb.rb_node )
+    {
+        victim = rb_entry(root, vbd_t, rb);
+        rb_erase(root, &blkif->vbd_rb);
+        ext = victim->extents;
+        kfree(victim);
+        for ( ; ext != NULL; ext = next_ext )
+        {
+            next_ext = ext->next;
+            kfree(ext);
+        }
+    }
+
+    spin_unlock(&blkif->vbd_lock);
+}
+
+
+/* Describe one VBD in *vbd_info; capacity is the sum of its extent lengths. */
+static int vbd_probe_single(blkif_t *blkif, vdisk_t *vbd_info, vbd_t *vbd)
+{
+    blkif_extent_le_t *ext = vbd->extents;
+
+    vbd_info->device   = vbd->vdevice;
+    vbd_info->capacity = 0ULL;
+    vbd_info->info     = vbd->type;
+    if ( vbd->readonly )
+        vbd_info->info |= VDISK_FLAG_RO;
+    for ( ; ext != NULL; ext = ext->next )
+        vbd_info->capacity += ext->extent.sector_length;
+    return 0; /* no failure path; the blkif argument is not used */
+}
+
+
+/* Fill vbd_info[] with up to max_vbds VBDs of blkif, in ascending tree order.
+ * Returns the number of entries written, or vbd_probe_single()'s error. */
+int vbd_probe(blkif_t *blkif, vdisk_t *vbd_info, int max_vbds)
+{
+    int rc = 0, nr_vbds = 0;
+    rb_node_t *rb;
+
+    spin_lock(&blkif->vbd_lock);
+    if ( (max_vbds <= 0) || ((rb = blkif->vbd_rb.rb_node) == NULL) )
+        goto out; /* No room in vbd_info[], or no VBDs: report zero. */
+ new_subtree:
+    /* STEP 1. Find least node (it'll be left-most). */
+    while ( rb->rb_left != NULL )
+        rb = rb->rb_left;
+
+    for ( ; ; )
+    {
+        /* STEP 2. Dealt with left subtree. Now process current node. */
+        if ( (rc = vbd_probe_single(blkif, &vbd_info[nr_vbds],
+                                    rb_entry(rb, vbd_t, rb))) != 0 )
+            goto out;
+        if ( ++nr_vbds == max_vbds )
+            goto out;
+
+        /* STEP 3. Process right subtree, if any. */
+        if ( rb->rb_right != NULL )
+        {
+            rb = rb->rb_right;
+            goto new_subtree;
+        }
+
+        /* STEP 4. Done both subtrees. Head back through ancestors. */
+        for ( ; ; )
+        {
+            /* We're done when we get back to the root node. */
+            if ( rb->rb_parent == NULL )
+                goto out;
+            /* If we are left of parent, then parent is next to process. */
+            if ( rb->rb_parent->rb_left == rb )
+                break;
+            /* If we are right of parent, then we climb to grandparent. */
+            rb = rb->rb_parent;
+        }
+
+        rb = rb->rb_parent;
+    }
+
+ out:
+    spin_unlock(&blkif->vbd_lock);
+    return (rc == 0) ? nr_vbds : rc;
+}
+
+
+/*
+ * Map the virtual segment pseg[0] onto physical extents. Returns 1 (pseg[0]
+ * rewritten), 2 (split over pseg[0..1] at an extent boundary), -ENODEV (no
+ * such VBD) or -EACCES (read-only VBD, or range not backed by extents).
+ */
+int vbd_translate(phys_seg_t *pseg, blkif_t *blkif, int operation)
+{
+    blkif_extent_le_t *x;
+    vbd_t             *vbd = NULL;
+    rb_node_t         *rb;
+    blkif_sector_t     sec_off;
+    unsigned long      nr_secs;
+    int                rc = -EACCES;
+
+    spin_lock(&blkif->vbd_lock);
+
+    rb = blkif->vbd_rb.rb_node;
+    while ( rb != NULL )
+    {
+        vbd = rb_entry(rb, vbd_t, rb);
+        if ( pseg->dev < vbd->vdevice )
+            rb = rb->rb_left;
+        else if ( pseg->dev > vbd->vdevice )
+            rb = rb->rb_right;
+        else
+            break;
+    }
+
+    if ( unlikely(rb == NULL) )
+    {
+        DPRINTK("vbd_translate; domain %u attempted to access "
+                "non-existent VBD.\n", blkif->domid);
+        rc = -ENODEV;
+        goto out;
+    }
+
+    if ( (operation == WRITE) && vbd->readonly )
+        goto out;
+
+    /* Find the extent that contains the first sector of the request. */
+    sec_off = pseg->sector_number;
+    nr_secs = pseg->nr_sects;
+    for ( x = vbd->extents; x != NULL; x = x->next )
+    {
+        if ( sec_off < x->extent.sector_length )
+            break;
+        sec_off -= x->extent.sector_length;
+    }
+    if ( unlikely(x == NULL) )
+    {
+        DPRINTK("vbd_translate: end of vbd.\n");
+        goto out;
+    }
+
+    pseg->dev           = x->extent.device;
+    pseg->sector_number = x->extent.sector_start + sec_off;
+    if ( (sec_off + nr_secs) <= x->extent.sector_length )
+    {
+        rc = 1;
+        goto out;
+    }
+
+    /*
+     * Here we deal with overrun onto the following extent. We don't deal with
+     * overrun of more than one boundary since each request is restricted to
+     * 2^9 512-byte sectors, so it should be trivial for control software to
+     * ensure that extents are large enough to prevent excessive overrun.
+     */
+    /* Adjust length of first chunk to run to end of first extent. */
+    pseg[0].nr_sects = x->extent.sector_length - sec_off;
+    /* Set second chunk buffer and length to start where first chunk ended. */
+    pseg[1].buffer   = pseg[0].buffer + (pseg[0].nr_sects << 9);
+    pseg[1].nr_sects = nr_secs - pseg[0].nr_sects;
+
+    /* Now move to the next extent. Check it exists and is long enough! */
+    if ( unlikely((x = x->next) == NULL) ||
+         unlikely(x->extent.sector_length < pseg[1].nr_sects) )
+    {
+        DPRINTK("vbd_translate: multiple overruns or end of vbd.\n");
+        goto out;
+    }
+
+    /* Store the real device and start sector for the second chunk. */
+    pseg[1].dev           = x->extent.device;
+    pseg[1].sector_number = x->extent.sector_start;
+    rc = 2;
+
+ out:
+    spin_unlock(&blkif->vbd_lock);
+    return rc;
+}