Diffstat (limited to 'linux-2.4.26-xen-sparse/arch/xen/drivers')
36 files changed, 8884 insertions, 0 deletions
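A note on the user-space interface to the balloon driver below: balloon.c keeps its ABI in one block marked "USER DEFINES -- THESE SHOULD BE COPIED TO USER-SPACE TOOLS". A minimal sketch of such a tool follows. The constants and the struct are copied from the driver; the /proc/xen/balloon path is an assumption based on create_xen_proc_entry("balloon", 0600), and the tool name balloon_ctl is hypothetical.

    /* balloon_ctl.c - hypothetical user-space companion to balloon.c below. */
    #include <fcntl.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <unistd.h>

    /* Copied from balloon.c, as its comment instructs. */
    #define USER_INFLATE_BALLOON 1 /* return mem to hypervisor */
    #define USER_DEFLATE_BALLOON 2 /* claim mem from hypervisor */
    typedef struct user_balloon_op {
        unsigned int op;
        unsigned long size;
    } user_balloon_op_t;

    int main(int argc, char **argv)
    {
        user_balloon_op_t bop;
        int fd;

        if ( argc != 3 )
        {
            fprintf(stderr, "usage: %s {inflate|deflate} <pages>\n", argv[0]);
            return 1;
        }

        bop.op   = (strcmp(argv[1], "inflate") == 0) ? USER_INFLATE_BALLOON
                                                     : USER_DEFLATE_BALLOON;
        bop.size = strtoul(argv[2], NULL, 0);

        /* Path assumed: xen_proc entries conventionally live under /proc/xen. */
        if ( (fd = open("/proc/xen/balloon", O_WRONLY)) < 0 )
        {
            perror("open");
            return 1;
        }

        /* balloon_write() consumes exactly sizeof(bop) and returns it on success. */
        if ( write(fd, &bop, sizeof(bop)) != sizeof(bop) )
        {
            perror("write");
            close(fd);
            return 1;
        }

        close(fd);
        return 0;
    }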
diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/balloon/Makefile b/linux-2.4.26-xen-sparse/arch/xen/drivers/balloon/Makefile
new file mode 100644
index 0000000000..9fb2227978
--- /dev/null
+++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/balloon/Makefile
@@ -0,0 +1,3 @@
+O_TARGET := drv.o
+obj-y := balloon.o
+include $(TOPDIR)/Rules.make
diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/balloon/balloon.c b/linux-2.4.26-xen-sparse/arch/xen/drivers/balloon/balloon.c
new file mode 100644
index 0000000000..56237380f3
--- /dev/null
+++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/balloon/balloon.c
@@ -0,0 +1,274 @@
+/******************************************************************************
+ * balloon.c
+ *
+ * Xen balloon driver - enables returning/claiming memory to/from Xen.
+ *
+ * Copyright (c) 2003, B Dragovic
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/errno.h>
+#include <asm/xen_proc.h>
+
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/smp_lock.h>
+#include <linux/pagemap.h>
+
+#include <asm/hypervisor.h>
+#include <asm/pgalloc.h>
+#include <asm/pgtable.h>
+#include <asm/uaccess.h>
+#include <asm/tlb.h>
+
+/* USER DEFINES -- THESE SHOULD BE COPIED TO USER-SPACE TOOLS */
+#define USER_INFLATE_BALLOON  1   /* return mem to hypervisor */
+#define USER_DEFLATE_BALLOON  2   /* claim mem from hypervisor */
+typedef struct user_balloon_op {
+    unsigned int  op;
+    unsigned long size;
+} user_balloon_op_t;
+/* END OF USER DEFINES */
+
+/* Dead entry written into balloon-owned entries in the PMT. */
+#define DEAD 0xdeadbeef
+
+static struct proc_dir_entry *balloon_pde;
+unsigned long credit;
+
+static inline pte_t *get_ptep(unsigned long addr)
+{
+    pgd_t *pgd; pmd_t *pmd; pte_t *ptep;
+    pgd = pgd_offset_k(addr);
+
+    if ( pgd_none(*pgd) || pgd_bad(*pgd) ) BUG();
+
+    pmd = pmd_offset(pgd, addr);
+    if ( pmd_none(*pmd) || pmd_bad(*pmd) ) BUG();
+
+    ptep = pte_offset(pmd, addr);
+
+    return ptep;
+}
+
+/* Main function for relinquishing a chunk of memory. */
+static unsigned long inflate_balloon(unsigned long num_pages)
+{
+    unsigned long *parray;
+    unsigned long *currp;
+    unsigned long curraddr;
+    unsigned long ret = 0;
+    unsigned long vaddr;
+    unsigned long i, j;
+
+    parray = (unsigned long *)kmalloc(num_pages * sizeof(unsigned long),
+                                      GFP_KERNEL);
+    currp = parray;
+
+    for ( i = 0; i < num_pages; i++ )
+    {
+        /* Try to obtain a free page (has to be done with GFP_ATOMIC). */
+        vaddr = __get_free_page(GFP_ATOMIC);
+
+        /* If allocation fails then free all reserved pages. */
+        if ( vaddr == 0 )
+        {
+            printk("Unable to inflate balloon by %ld, only %ld pages free.\n",
+                   num_pages, i);
+            currp = parray;
+            for ( j = 0; j < i; j++ ) {
+                free_page(*currp++);
+            }
+            goto cleanup;
+        }
+
+        *currp++ = vaddr;
+    }
+
+
+    currp = parray;
+    for ( i = 0; i < num_pages; i++ )
+    {
+        curraddr = *currp;
+        *currp = virt_to_machine(*currp) >> PAGE_SHIFT;
+        queue_l1_entry_update(get_ptep(curraddr), 0);
+        phys_to_machine_mapping[__pa(curraddr) >> PAGE_SHIFT] = DEAD;
+        currp++;
+    }
+
+    XEN_flush_page_update_queue();
+
+    ret = HYPERVISOR_dom_mem_op(MEMOP_decrease_reservation,
+                                parray, num_pages);
+    if ( unlikely(ret != num_pages) )
+    {
+        printk("Unable to inflate balloon, error %lx\n", ret);
+        goto cleanup;
+    }
+
+    credit += num_pages;
+    ret = num_pages;
+
+ cleanup:
+    kfree(parray);
+
+    return ret;
+}
+
+/* Install new mem pages obtained by deflate_balloon. This function walks
+ * the phys->machine mapping table looking for DEAD entries and populates
+ * them.
+ */
+static unsigned long process_new_pages(unsigned long * parray,
+                                       unsigned long num)
+{
+    /* Currently, this function is rather simplistic, as
+     * it is assumed that the domain reclaims only the
+     * number of pages previously released. This is to change
+     * soon, and the code to extend page tables etc. will be
+     * incorporated here.
+     */
+
+    unsigned long tot_pages = start_info.nr_pages;
+    unsigned long * curr = parray;
+    unsigned long num_installed;
+    unsigned long i;
+
+    num_installed = 0;
+    for ( i = 0; (i < tot_pages) && (num_installed < num); i++ )
+    {
+        if ( phys_to_machine_mapping[i] == DEAD )
+        {
+            phys_to_machine_mapping[i] = *curr;
+            queue_l1_entry_update(
+                (pte_t *)((i << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE), i);
+            queue_l1_entry_update(
+                get_ptep((unsigned long)__va(i << PAGE_SHIFT)),
+                ((*curr) << PAGE_SHIFT) | pgprot_val(PAGE_KERNEL));
+
+            *curr = (unsigned long)__va(i << PAGE_SHIFT);
+            curr++;
+            num_installed++;
+        }
+    }
+
+    /* Now, this is tricky (and will also change for machine addrs that
+     * are mapped to not-previously-released addresses): we free the pages
+     * that were allocated by __get_free_page() (their mappings are
+     * different now, of course).
+     */
+    curr = parray;
+    for ( i = 0; i < num_installed; i++ )
+    {
+        free_page(*curr);
+        curr++;
+    }
+
+    return num_installed;
+}
+
+unsigned long deflate_balloon(unsigned long num_pages)
+{
+    unsigned long ret;
+    unsigned long * parray;
+
+    if ( num_pages > credit )
+    {
+        printk("Cannot allocate more pages than previously released.\n");
+        return -EAGAIN;
+    }
+
+    parray = (unsigned long *)kmalloc(num_pages * sizeof(unsigned long),
+                                      GFP_KERNEL);
+
+    ret = HYPERVISOR_dom_mem_op(MEMOP_increase_reservation,
+                                parray, num_pages);
+    if ( unlikely(ret != num_pages) )
+    {
+        printk("Unable to deflate balloon, error %lx\n", ret);
+        goto cleanup;
+    }
+
+    if ( (ret = process_new_pages(parray, num_pages)) < num_pages )
+    {
+        printk("Unable to deflate balloon by specified %lx pages, only %lx.\n",
+               num_pages, ret);
+        goto cleanup;
+    }
+
+    ret = num_pages;
+    credit -= num_pages;
+
+ cleanup:
+    kfree(parray);
+
+    return ret;
+}
+
+static int balloon_write(struct file *file, const char *buffer,
+                         u_long count, void *data)
+{
+    user_balloon_op_t bop;
+
+    /* Only admin can play with the balloon :) */
+    if ( !capable(CAP_SYS_ADMIN) )
+        return -EPERM;
+
+    if ( copy_from_user(&bop, buffer, sizeof(bop)) )
+        return -EFAULT;
+
+    switch ( bop.op )
+    {
+    case USER_INFLATE_BALLOON:
+        if ( inflate_balloon(bop.size) < bop.size )
+            return -EAGAIN;
+        break;
+
+    case USER_DEFLATE_BALLOON:
+        deflate_balloon(bop.size);
+        break;
+
+    default:
+        printk("Unknown command to balloon driver.\n");
+        return -EFAULT;
+    }
+
+    return sizeof(bop);
+}
+
+/*
+ * Main balloon driver initialization function.
+ */ +static int __init init_module(void) +{ + printk(KERN_ALERT "Starting Xen Balloon driver\n"); + + credit = 0; + + if ( (balloon_pde = create_xen_proc_entry("balloon", 0600)) == NULL ) + { + printk(KERN_ALERT "Unable to create balloon driver proc entry!"); + return -1; + } + + balloon_pde->write_proc = balloon_write; + + return 0; +} + +static void __exit cleanup_module(void) +{ + if ( balloon_pde != NULL ) + { + remove_xen_proc_entry("balloon"); + balloon_pde = NULL; + } +} + +module_init(init_module); +module_exit(cleanup_module); + + diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/Makefile b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/Makefile new file mode 100644 index 0000000000..20c8192d3d --- /dev/null +++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/Makefile @@ -0,0 +1,10 @@ + +O_TARGET := drv.o + +subdir-y += frontend +obj-y += frontend/drv.o + +subdir-$(CONFIG_XEN_PHYSDEV_ACCESS) += backend +obj-$(CONFIG_XEN_PHYSDEV_ACCESS) += backend/drv.o + +include $(TOPDIR)/Rules.make diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/Makefile b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/Makefile new file mode 100644 index 0000000000..4c8c17367c --- /dev/null +++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/Makefile @@ -0,0 +1,3 @@ +O_TARGET := drv.o +obj-y := main.o control.o interface.o vbd.o +include $(TOPDIR)/Rules.make diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/common.h b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/common.h new file mode 100644 index 0000000000..d9f1d22908 --- /dev/null +++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/common.h @@ -0,0 +1,108 @@ +/****************************************************************************** + * arch/xen/drivers/blkif/backend/common.h + */ + +#ifndef __BLKIF__BACKEND__COMMON_H__ +#define __BLKIF__BACKEND__COMMON_H__ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/rbtree.h> +#include <linux/interrupt.h> +#include <linux/slab.h> +#include <linux/blkdev.h> +#include <asm/ctrl_if.h> +#include <asm/io.h> +#include "../blkif.h" + +#ifndef NDEBUG +#define ASSERT(_p) \ + if ( !(_p) ) { printk("Assertion '%s' failed, line %d, file %s", #_p , \ + __LINE__, __FILE__); *(int*)0=0; } +#define DPRINTK(_f, _a...) printk("(file=%s, line=%d) " _f, \ + __FILE__ , __LINE__ , ## _a ) +#else +#define ASSERT(_p) ((void)0) +#define DPRINTK(_f, _a...) ((void)0) +#endif + +typedef struct blkif_st { + /* Unique identifier for this interface. */ + domid_t domid; + unsigned int handle; + /* Physical parameters of the comms window. */ + unsigned long shmem_frame; + unsigned int evtchn; + int irq; + /* Comms information. */ + blkif_ring_t *blk_ring_base; /* ioremap()'ed ptr to shmem_frame. */ + BLK_RING_IDX blk_req_cons; /* Request consumer. */ + BLK_RING_IDX blk_resp_prod; /* Private version of response producer. */ + /* VBDs attached to this interface. */ + rb_root_t vbd_rb; /* Mapping from 16-bit vdevices to VBDs. */ + spinlock_t vbd_lock; /* Protects VBD mapping. */ + /* Private fields. */ + enum { DISCONNECTED, DISCONNECTING, CONNECTED } status; + /* + * DISCONNECT response is deferred until pending requests are ack'ed. + * We therefore need to store the id from the original request. 
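+     * It is kept in disconnect_rspid below and echoed back by
+     * __blkif_disconnect_complete() once the last reference is dropped.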
+ */ + u8 disconnect_rspid; + struct blkif_st *hash_next; + struct list_head blkdev_list; + spinlock_t blk_ring_lock; + atomic_t refcnt; +} blkif_t; + +void blkif_create(blkif_be_create_t *create); +void blkif_destroy(blkif_be_destroy_t *destroy); +void blkif_connect(blkif_be_connect_t *connect); +int blkif_disconnect(blkif_be_disconnect_t *disconnect, u8 rsp_id); +void __blkif_disconnect_complete(blkif_t *blkif); +blkif_t *blkif_find_by_handle(domid_t domid, unsigned int handle); +#define blkif_get(_b) (atomic_inc(&(_b)->refcnt)) +#define blkif_put(_b) \ + do { \ + if ( atomic_dec_and_test(&(_b)->refcnt) ) \ + __blkif_disconnect_complete(_b); \ + } while (0) + +/* An entry in a list of xen_extents. */ +typedef struct _blkif_extent_le { + blkif_extent_t extent; /* an individual extent */ + struct _blkif_extent_le *next; /* and a pointer to the next */ +} blkif_extent_le_t; + +typedef struct _vbd { + blkif_vdev_t vdevice; /* what the domain refers to this vbd as */ + unsigned char readonly; /* Non-zero -> read-only */ + unsigned char type; /* XD_TYPE_xxx */ + blkif_extent_le_t *extents; /* list of xen_extents making up this vbd */ + rb_node_t rb; /* for linking into R-B tree lookup struct */ +} vbd_t; + +void vbd_create(blkif_be_vbd_create_t *create); +void vbd_grow(blkif_be_vbd_grow_t *grow); +void vbd_shrink(blkif_be_vbd_shrink_t *shrink); +void vbd_destroy(blkif_be_vbd_destroy_t *delete); +int vbd_probe(blkif_t *blkif, vdisk_t *vbd_info, int max_vbds); +void destroy_all_vbds(blkif_t *blkif); + +/* Describes a [partial] disk extent (part of a block io request) */ +typedef struct { + unsigned short dev; + unsigned short nr_sects; + unsigned long buffer; + xen_sector_t sector_number; +} phys_seg_t; + +int vbd_translate(phys_seg_t *pseg, blkif_t *blkif, int operation); + +void blkif_interface_init(void); +void blkif_ctrlif_init(void); + +void blkif_deschedule(blkif_t *blkif); + +void blkif_be_int(int irq, void *dev_id, struct pt_regs *regs); + +#endif /* __BLKIF__BACKEND__COMMON_H__ */ diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/control.c b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/control.c new file mode 100644 index 0000000000..0b26224651 --- /dev/null +++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/control.c @@ -0,0 +1,87 @@ +/****************************************************************************** + * arch/xen/drivers/blkif/backend/control.c + * + * Routines for interfacing with the control plane. + * + * Copyright (c) 2004, Keir Fraser + */ + +#include "common.h" + +static void blkif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id) +{ + DPRINTK("Received blkif backend message, subtype=%d\n", msg->subtype); + + switch ( msg->subtype ) + { + case CMSG_BLKIF_BE_CREATE: + if ( msg->length != sizeof(blkif_be_create_t) ) + goto parse_error; + blkif_create((blkif_be_create_t *)&msg->msg[0]); + break; + case CMSG_BLKIF_BE_DESTROY: + if ( msg->length != sizeof(blkif_be_destroy_t) ) + goto parse_error; + blkif_destroy((blkif_be_destroy_t *)&msg->msg[0]); + break; + case CMSG_BLKIF_BE_CONNECT: + if ( msg->length != sizeof(blkif_be_connect_t) ) + goto parse_error; + blkif_connect((blkif_be_connect_t *)&msg->msg[0]); + break; + case CMSG_BLKIF_BE_DISCONNECT: + if ( msg->length != sizeof(blkif_be_disconnect_t) ) + goto parse_error; + if ( !blkif_disconnect((blkif_be_disconnect_t *)&msg->msg[0],msg->id) ) + return; /* Sending the response is deferred until later. 
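+                       It is sent later by __blkif_disconnect_complete(),
+                       once the interface's reference count drops to zero.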
*/ + break; + case CMSG_BLKIF_BE_VBD_CREATE: + if ( msg->length != sizeof(blkif_be_vbd_create_t) ) + goto parse_error; + vbd_create((blkif_be_vbd_create_t *)&msg->msg[0]); + break; + case CMSG_BLKIF_BE_VBD_DESTROY: + if ( msg->length != sizeof(blkif_be_vbd_destroy_t) ) + goto parse_error; + vbd_destroy((blkif_be_vbd_destroy_t *)&msg->msg[0]); + break; + case CMSG_BLKIF_BE_VBD_GROW: + if ( msg->length != sizeof(blkif_be_vbd_grow_t) ) + goto parse_error; + vbd_grow((blkif_be_vbd_grow_t *)&msg->msg[0]); + break; + case CMSG_BLKIF_BE_VBD_SHRINK: + if ( msg->length != sizeof(blkif_be_vbd_shrink_t) ) + goto parse_error; + vbd_shrink((blkif_be_vbd_shrink_t *)&msg->msg[0]); + break; + default: + goto parse_error; + } + + ctrl_if_send_response(msg); + return; + + parse_error: + DPRINTK("Parse error while reading message subtype %d, len %d\n", + msg->subtype, msg->length); + msg->length = 0; + ctrl_if_send_response(msg); +} + +void blkif_ctrlif_init(void) +{ + ctrl_msg_t cmsg; + blkif_be_driver_status_changed_t st; + + (void)ctrl_if_register_receiver(CMSG_BLKIF_BE, blkif_ctrlif_rx, + CALLBACK_IN_BLOCKING_CONTEXT); + + /* Send a driver-UP notification to the domain controller. */ + cmsg.type = CMSG_BLKIF_BE; + cmsg.subtype = CMSG_BLKIF_BE_DRIVER_STATUS_CHANGED; + cmsg.length = sizeof(blkif_be_driver_status_changed_t); + st.status = BLKIF_DRIVER_STATUS_UP; + memcpy(cmsg.msg, &st, sizeof(st)); + ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE); +} diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/interface.c b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/interface.c new file mode 100644 index 0000000000..780d793c6c --- /dev/null +++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/interface.c @@ -0,0 +1,233 @@ +/****************************************************************************** + * arch/xen/drivers/blkif/backend/interface.c + * + * Block-device interface management. + * + * Copyright (c) 2004, Keir Fraser + */ + +#include "common.h" + +#define BLKIF_HASHSZ 1024 +#define BLKIF_HASH(_d,_h) (((int)(_d)^(int)(_h))&(BLKIF_HASHSZ-1)) + +static kmem_cache_t *blkif_cachep; +static blkif_t *blkif_hash[BLKIF_HASHSZ]; + +blkif_t *blkif_find_by_handle(domid_t domid, unsigned int handle) +{ + blkif_t *blkif = blkif_hash[BLKIF_HASH(domid, handle)]; + while ( (blkif != NULL) && + ((blkif->domid != domid) || (blkif->handle != handle)) ) + blkif = blkif->hash_next; + return blkif; +} + +void __blkif_disconnect_complete(blkif_t *blkif) +{ + ctrl_msg_t cmsg; + blkif_be_disconnect_t disc; + + /* + * These can't be done in __blkif_disconnect() because at that point there + * may be outstanding requests at the disc whose asynchronous responses + * must still be notified to the remote driver. + */ + unbind_evtchn_from_irq(blkif->evtchn); + vfree(blkif->blk_ring_base); + + /* Construct the deferred response message. */ + cmsg.type = CMSG_BLKIF_BE; + cmsg.subtype = CMSG_BLKIF_BE_DISCONNECT; + cmsg.id = blkif->disconnect_rspid; + cmsg.length = sizeof(blkif_be_disconnect_t); + disc.domid = blkif->domid; + disc.blkif_handle = blkif->handle; + disc.status = BLKIF_BE_STATUS_OKAY; + memcpy(cmsg.msg, &disc, sizeof(disc)); + + /* + * Make sure message is constructed /before/ status change, because + * after the status change the 'blkif' structure could be deallocated at + * any time. Also make sure we send the response /after/ status change, + * as otherwise a subsequent CONNECT request could spuriously fail if + * another CPU doesn't see the status change yet. 
+ */ + mb(); + if ( blkif->status != DISCONNECTING ) + BUG(); + blkif->status = DISCONNECTED; + mb(); + + /* Send the successful response. */ + ctrl_if_send_response(&cmsg); +} + +void blkif_create(blkif_be_create_t *create) +{ + domid_t domid = create->domid; + unsigned int handle = create->blkif_handle; + blkif_t **pblkif, *blkif; + + if ( (blkif = kmem_cache_alloc(blkif_cachep, GFP_KERNEL)) == NULL ) + { + DPRINTK("Could not create blkif: out of memory\n"); + create->status = BLKIF_BE_STATUS_OUT_OF_MEMORY; + return; + } + + memset(blkif, 0, sizeof(*blkif)); + blkif->domid = domid; + blkif->handle = handle; + blkif->status = DISCONNECTED; + spin_lock_init(&blkif->vbd_lock); + spin_lock_init(&blkif->blk_ring_lock); + atomic_set(&blkif->refcnt, 0); + + pblkif = &blkif_hash[BLKIF_HASH(domid, handle)]; + while ( *pblkif != NULL ) + { + if ( ((*pblkif)->domid == domid) && ((*pblkif)->handle == handle) ) + { + DPRINTK("Could not create blkif: already exists\n"); + create->status = BLKIF_BE_STATUS_INTERFACE_EXISTS; + kmem_cache_free(blkif_cachep, blkif); + return; + } + pblkif = &(*pblkif)->hash_next; + } + + blkif->hash_next = *pblkif; + *pblkif = blkif; + + DPRINTK("Successfully created blkif\n"); + create->status = BLKIF_BE_STATUS_OKAY; +} + +void blkif_destroy(blkif_be_destroy_t *destroy) +{ + domid_t domid = destroy->domid; + unsigned int handle = destroy->blkif_handle; + blkif_t **pblkif, *blkif; + + pblkif = &blkif_hash[BLKIF_HASH(domid, handle)]; + while ( (blkif = *pblkif) != NULL ) + { + if ( (blkif->domid == domid) && (blkif->handle == handle) ) + { + if ( blkif->status != DISCONNECTED ) + goto still_connected; + goto destroy; + } + pblkif = &blkif->hash_next; + } + + destroy->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND; + return; + + still_connected: + destroy->status = BLKIF_BE_STATUS_INTERFACE_CONNECTED; + return; + + destroy: + *pblkif = blkif->hash_next; + destroy_all_vbds(blkif); + kmem_cache_free(blkif_cachep, blkif); + destroy->status = BLKIF_BE_STATUS_OKAY; +} + +void blkif_connect(blkif_be_connect_t *connect) +{ + domid_t domid = connect->domid; + unsigned int handle = connect->blkif_handle; + unsigned int evtchn = connect->evtchn; + unsigned long shmem_frame = connect->shmem_frame; + struct vm_struct *vma; + pgprot_t prot; + int error; + blkif_t *blkif; + + blkif = blkif_find_by_handle(domid, handle); + if ( unlikely(blkif == NULL) ) + { + DPRINTK("blkif_connect attempted for non-existent blkif (%u,%u)\n", + connect->domid, connect->blkif_handle); + connect->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND; + return; + } + + if ( (vma = get_vm_area(PAGE_SIZE, VM_IOREMAP)) == NULL ) + { + connect->status = BLKIF_BE_STATUS_OUT_OF_MEMORY; + return; + } + + prot = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED); + error = direct_remap_area_pages(&init_mm, VMALLOC_VMADDR(vma->addr), + shmem_frame<<PAGE_SHIFT, PAGE_SIZE, + prot, domid); + if ( error != 0 ) + { + if ( error == -ENOMEM ) + connect->status = BLKIF_BE_STATUS_OUT_OF_MEMORY; + else if ( error == -EFAULT ) + connect->status = BLKIF_BE_STATUS_MAPPING_ERROR; + else + connect->status = BLKIF_BE_STATUS_ERROR; + vfree(vma->addr); + return; + } + + if ( blkif->status != DISCONNECTED ) + { + connect->status = BLKIF_BE_STATUS_INTERFACE_CONNECTED; + vfree(vma->addr); + return; + } + + blkif->evtchn = evtchn; + blkif->irq = bind_evtchn_to_irq(evtchn); + blkif->shmem_frame = shmem_frame; + blkif->blk_ring_base = (blkif_ring_t *)vma->addr; + blkif->status = CONNECTED; + blkif_get(blkif); + + request_irq(blkif->irq, 
blkif_be_int, 0, "blkif-backend", blkif); + + connect->status = BLKIF_BE_STATUS_OKAY; +} + +int blkif_disconnect(blkif_be_disconnect_t *disconnect, u8 rsp_id) +{ + domid_t domid = disconnect->domid; + unsigned int handle = disconnect->blkif_handle; + blkif_t *blkif; + + blkif = blkif_find_by_handle(domid, handle); + if ( unlikely(blkif == NULL) ) + { + DPRINTK("blkif_disconnect attempted for non-existent blkif" + " (%u,%u)\n", disconnect->domid, disconnect->blkif_handle); + disconnect->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND; + return 1; /* Caller will send response error message. */ + } + + if ( blkif->status == CONNECTED ) + { + blkif->status = DISCONNECTING; + blkif->disconnect_rspid = rsp_id; + wmb(); /* Let other CPUs see the status change. */ + free_irq(blkif->irq, NULL); + blkif_deschedule(blkif); + blkif_put(blkif); + } + + return 0; /* Caller should not send response message. */ +} + +void __init blkif_interface_init(void) +{ + blkif_cachep = kmem_cache_create("blkif_cache", sizeof(blkif_t), + 0, 0, NULL, NULL); + memset(blkif_hash, 0, sizeof(blkif_hash)); +} diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/main.c b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/main.c new file mode 100644 index 0000000000..803af976d2 --- /dev/null +++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/main.c @@ -0,0 +1,523 @@ +/****************************************************************************** + * arch/xen/drivers/blkif/backend/main.c + * + * Back-end of the driver for virtual block devices. This portion of the + * driver exports a 'unified' block-device interface that can be accessed + * by any operating system that implements a compatible front end. A + * reference front-end implementation can be found in: + * arch/xen/drivers/blkif/frontend + * + * Copyright (c) 2003-2004, Keir Fraser & Steve Hand + */ + +#include "common.h" + +/* + * These are rather arbitrary. They are fairly large because adjacent requests + * pulled from a communication ring are quite likely to end up being part of + * the same scatter/gather request at the disc. + * + * ** TRY INCREASING 'MAX_PENDING_REQS' IF WRITE SPEEDS SEEM TOO LOW ** + * This will increase the chances of being able to write whole tracks. + * 64 should be enough to keep us competitive with Linux. + */ +#define MAX_PENDING_REQS 64 +#define BATCH_PER_DOMAIN 16 + +/* + * NB. We place a page of padding between each buffer page to avoid incorrect + * merging of requests by the IDE and SCSI merging routines. Otherwise, two + * adjacent buffers in a scatter-gather request would have adjacent page + * numbers: since the merge routines don't realise that this is in *pseudophys* + * space, not real space, they may collapse the s-g elements! + */ +static unsigned long mmap_vstart; +#define MMAP_PAGES_PER_REQUEST \ + (2 * (BLKIF_MAX_SEGMENTS_PER_REQUEST + 1)) +#define MMAP_PAGES \ + (MAX_PENDING_REQS * MMAP_PAGES_PER_REQUEST) +#define MMAP_VADDR(_req,_seg) \ + (mmap_vstart + \ + ((_req) * MMAP_PAGES_PER_REQUEST * PAGE_SIZE) + \ + ((_seg) * 2 * PAGE_SIZE)) + +/* + * Each outstanding request that we've passed to the lower device layers has a + * 'pending_req' allocated to it. Each buffer_head that completes decrements + * the pendcnt towards zero. When it hits zero, the specified domain has a + * response queued for it, with the saved 'id' passed back. 
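+ * For example, a three-segment request starts with pendcnt == 3; the third
+ * end_block_io_op() completion queues the single response for the domain.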
+ */ +typedef struct { + blkif_t *blkif; + unsigned long id; + int nr_pages; + atomic_t pendcnt; + unsigned short operation; + int status; +} pending_req_t; + +/* + * We can't allocate pending_req's in order, since they may complete out of + * order. We therefore maintain an allocation ring. This ring also indicates + * when enough work has been passed down -- at that point the allocation ring + * will be empty. + */ +static pending_req_t pending_reqs[MAX_PENDING_REQS]; +static unsigned char pending_ring[MAX_PENDING_REQS]; +static spinlock_t pend_prod_lock = SPIN_LOCK_UNLOCKED; +/* NB. We use a different index type to differentiate from shared blk rings. */ +typedef unsigned int PEND_RING_IDX; +#define MASK_PEND_IDX(_i) ((_i)&(MAX_PENDING_REQS-1)) +static PEND_RING_IDX pending_prod, pending_cons; +#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons) + +static kmem_cache_t *buffer_head_cachep; + +static int do_block_io_op(blkif_t *blkif, int max_to_do); +static void dispatch_probe(blkif_t *blkif, blkif_request_t *req); +static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req); +static void make_response(blkif_t *blkif, unsigned long id, + unsigned short op, int st); + +static void fast_flush_area(int idx, int nr_pages) +{ + multicall_entry_t mcl[MMAP_PAGES_PER_REQUEST]; + int i; + + for ( i = 0; i < nr_pages; i++ ) + { + mcl[i].op = __HYPERVISOR_update_va_mapping; + mcl[i].args[0] = MMAP_VADDR(idx, i) >> PAGE_SHIFT; + mcl[i].args[1] = 0; + mcl[i].args[2] = 0; + } + + mcl[nr_pages-1].args[2] = UVMF_FLUSH_TLB; + (void)HYPERVISOR_multicall(mcl, nr_pages); +} + + +/****************************************************************** + * BLOCK-DEVICE SCHEDULER LIST MAINTENANCE + */ + +static struct list_head io_schedule_list; +static spinlock_t io_schedule_list_lock; + +static int __on_blkdev_list(blkif_t *blkif) +{ + return blkif->blkdev_list.next != NULL; +} + +static void remove_from_blkdev_list(blkif_t *blkif) +{ + unsigned long flags; + if ( !__on_blkdev_list(blkif) ) return; + spin_lock_irqsave(&io_schedule_list_lock, flags); + if ( __on_blkdev_list(blkif) ) + { + list_del(&blkif->blkdev_list); + blkif->blkdev_list.next = NULL; + blkif_put(blkif); + } + spin_unlock_irqrestore(&io_schedule_list_lock, flags); +} + +static void add_to_blkdev_list_tail(blkif_t *blkif) +{ + unsigned long flags; + if ( __on_blkdev_list(blkif) ) return; + spin_lock_irqsave(&io_schedule_list_lock, flags); + if ( !__on_blkdev_list(blkif) && (blkif->status == CONNECTED) ) + { + list_add_tail(&blkif->blkdev_list, &io_schedule_list); + blkif_get(blkif); + } + spin_unlock_irqrestore(&io_schedule_list_lock, flags); +} + + +/****************************************************************** + * SCHEDULER FUNCTIONS + */ + +static void io_schedule(unsigned long unused) +{ + blkif_t *blkif; + struct list_head *ent; + + /* Queue up a batch of requests. */ + while ( (NR_PENDING_REQS < MAX_PENDING_REQS) && + !list_empty(&io_schedule_list) ) + { + ent = io_schedule_list.next; + blkif = list_entry(ent, blkif_t, blkdev_list); + blkif_get(blkif); + remove_from_blkdev_list(blkif); + if ( do_block_io_op(blkif, BATCH_PER_DOMAIN) ) + add_to_blkdev_list_tail(blkif); + blkif_put(blkif); + } + + /* Push the batch through to disc. 
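+     * (run_task_queue(&tq_disk) runs the queued unplug routines, so the
+     * low-level driver sees the whole batch at once.)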
*/ + run_task_queue(&tq_disk); +} + +static DECLARE_TASKLET(io_schedule_tasklet, io_schedule, 0); + +static void maybe_trigger_io_schedule(void) +{ + /* + * Needed so that two processes, who together make the following predicate + * true, don't both read stale values and evaluate the predicate + * incorrectly. Incredibly unlikely to stall the scheduler on x86, but... + */ + smp_mb(); + + if ( (NR_PENDING_REQS < (MAX_PENDING_REQS/2)) && + !list_empty(&io_schedule_list) ) + tasklet_schedule(&io_schedule_tasklet); +} + + + +/****************************************************************** + * COMPLETION CALLBACK -- Called as bh->b_end_io() + */ + +static void __end_block_io_op(pending_req_t *pending_req, int uptodate) +{ + unsigned long flags; + + /* An error fails the entire request. */ + if ( !uptodate ) + { + DPRINTK("Buffer not up-to-date at end of operation\n"); + pending_req->status = BLKIF_RSP_ERROR; + } + + if ( atomic_dec_and_test(&pending_req->pendcnt) ) + { + int pending_idx = pending_req - pending_reqs; + fast_flush_area(pending_idx, pending_req->nr_pages); + make_response(pending_req->blkif, pending_req->id, + pending_req->operation, pending_req->status); + blkif_put(pending_req->blkif); + spin_lock_irqsave(&pend_prod_lock, flags); + pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx; + spin_unlock_irqrestore(&pend_prod_lock, flags); + maybe_trigger_io_schedule(); + } +} + +static void end_block_io_op(struct buffer_head *bh, int uptodate) +{ + __end_block_io_op(bh->b_private, uptodate); + kmem_cache_free(buffer_head_cachep, bh); +} + + + +/****************************************************************************** + * NOTIFICATION FROM GUEST OS. + */ + +void blkif_be_int(int irq, void *dev_id, struct pt_regs *regs) +{ + blkif_t *blkif = dev_id; + add_to_blkdev_list_tail(blkif); + maybe_trigger_io_schedule(); +} + + + +/****************************************************************** + * DOWNWARD CALLS -- These interface with the block-device layer proper. + */ + +static int do_block_io_op(blkif_t *blkif, int max_to_do) +{ + blkif_ring_t *blk_ring = blkif->blk_ring_base; + blkif_request_t *req; + BLK_RING_IDX i; + int more_to_do = 0; + + /* Take items off the comms ring, taking care not to overflow. */ + for ( i = blkif->blk_req_cons; + (i != blk_ring->req_prod) && ((i-blkif->blk_resp_prod) != + BLK_RING_SIZE); + i++ ) + { + if ( (max_to_do-- == 0) || (NR_PENDING_REQS == MAX_PENDING_REQS) ) + { + more_to_do = 1; + break; + } + + req = &blk_ring->ring[MASK_BLK_IDX(i)].req; + switch ( req->operation ) + { + case BLKIF_OP_READ: + case BLKIF_OP_WRITE: + dispatch_rw_block_io(blkif, req); + break; + + case BLKIF_OP_PROBE: + dispatch_probe(blkif, req); + break; + + default: + DPRINTK("error: unknown block io operation [%d]\n", + blk_ring->ring[i].req.operation); + make_response(blkif, blk_ring->ring[i].req.id, + blk_ring->ring[i].req.operation, BLKIF_RSP_ERROR); + break; + } + } + + blkif->blk_req_cons = i; + return more_to_do; +} + +static void dispatch_probe(blkif_t *blkif, blkif_request_t *req) +{ + int rsp = BLKIF_RSP_ERROR; + int pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)]; + + /* We expect one buffer only. */ + if ( unlikely(req->nr_segments != 1) ) + goto out; + + /* Make sure the buffer is page-sized. 
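+     * (first_sect == 0 and last_sect == 7 together span all eight 512-byte
+     * sectors of the frame, i.e. one whole 4kB page.)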
*/ + if ( (blkif_first_sect(req->frame_and_sects[0]) != 0) || + (blkif_last_sect(req->frame_and_sects[0]) != 7) ) + goto out; + + if ( HYPERVISOR_update_va_mapping_otherdomain( + MMAP_VADDR(pending_idx, 0) >> PAGE_SHIFT, + (pte_t) { (req->frame_and_sects[0] & PAGE_MASK) | __PAGE_KERNEL }, + 0, blkif->domid) ) + goto out; + + rsp = vbd_probe(blkif, (vdisk_t *)MMAP_VADDR(pending_idx, 0), + PAGE_SIZE / sizeof(vdisk_t)); + + out: + fast_flush_area(pending_idx, 1); + make_response(blkif, req->id, req->operation, rsp); +} + +static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req) +{ + extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]); + struct buffer_head *bh; + int operation = (req->operation == BLKIF_OP_WRITE) ? WRITE : READ; + short nr_sects; + unsigned long buffer, fas; + int i, tot_sects, pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)]; + pending_req_t *pending_req; + unsigned long remap_prot; + multicall_entry_t mcl[MMAP_PAGES_PER_REQUEST]; + + /* We map virtual scatter/gather segments to physical segments. */ + int new_segs, nr_psegs = 0; + phys_seg_t phys_seg[BLKIF_MAX_SEGMENTS_PER_REQUEST + 1]; + + /* Check that number of segments is sane. */ + if ( unlikely(req->nr_segments == 0) || + unlikely(req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST) ) + { + DPRINTK("Bad number of segments in request (%d)\n", req->nr_segments); + goto bad_descriptor; + } + + /* + * Check each address/size pair is sane, and convert into a + * physical device and block offset. Note that if the offset and size + * crosses a virtual extent boundary, we may end up with more + * physical scatter/gather segments than virtual segments. + */ + for ( i = tot_sects = 0; i < req->nr_segments; i++, tot_sects += nr_sects ) + { + fas = req->frame_and_sects[i]; + buffer = (fas & PAGE_MASK) | (blkif_first_sect(fas) << 9); + nr_sects = blkif_last_sect(fas) - blkif_first_sect(fas) + 1; + + if ( nr_sects <= 0 ) + goto bad_descriptor; + + phys_seg[nr_psegs].dev = req->device; + phys_seg[nr_psegs].sector_number = req->sector_number + tot_sects; + phys_seg[nr_psegs].buffer = buffer; + phys_seg[nr_psegs].nr_sects = nr_sects; + + /* Translate the request into the relevant 'physical device' */ + new_segs = vbd_translate(&phys_seg[nr_psegs], blkif, operation); + if ( new_segs < 0 ) + { + DPRINTK("access denied: %s of [%llu,%llu] on dev=%04x\n", + operation == READ ? "read" : "write", + req->sector_number + tot_sects, + req->sector_number + tot_sects + nr_sects, + req->device); + goto bad_descriptor; + } + + nr_psegs += new_segs; + ASSERT(nr_psegs <= (BLKIF_MAX_SEGMENTS_PER_REQUEST+1)); + } + + /* Nonsensical zero-sized request? 
*/ + if ( unlikely(nr_psegs == 0) ) + goto bad_descriptor; + + if ( operation == READ ) + remap_prot = _PAGE_PRESENT|_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW; + else + remap_prot = _PAGE_PRESENT|_PAGE_DIRTY|_PAGE_ACCESSED; + + for ( i = 0; i < nr_psegs; i++ ) + { + mcl[i].op = __HYPERVISOR_update_va_mapping_otherdomain; + mcl[i].args[0] = MMAP_VADDR(pending_idx, i) >> PAGE_SHIFT; + mcl[i].args[1] = (phys_seg[i].buffer & PAGE_MASK) | remap_prot; + mcl[i].args[2] = 0; + mcl[i].args[3] = blkif->domid; + + phys_to_machine_mapping[__pa(MMAP_VADDR(pending_idx, i))>>PAGE_SHIFT] = + phys_seg[i].buffer >> PAGE_SHIFT; + } + + (void)HYPERVISOR_multicall(mcl, nr_psegs); + + for ( i = 0; i < nr_psegs; i++ ) + { + if ( unlikely(mcl[i].args[5] != 0) ) + { + DPRINTK("invalid buffer -- could not remap it\n"); + fast_flush_area(pending_idx, nr_psegs); + goto bad_descriptor; + } + } + + pending_req = &pending_reqs[pending_idx]; + pending_req->blkif = blkif; + pending_req->id = req->id; + pending_req->operation = operation; + pending_req->status = BLKIF_RSP_OKAY; + pending_req->nr_pages = nr_psegs; + atomic_set(&pending_req->pendcnt, nr_psegs); + pending_cons++; + + blkif_get(blkif); + + /* Now we pass each segment down to the real blkdev layer. */ + for ( i = 0; i < nr_psegs; i++ ) + { + bh = kmem_cache_alloc(buffer_head_cachep, GFP_ATOMIC); + if ( unlikely(bh == NULL) ) + { + __end_block_io_op(pending_req, 0); + continue; + } + memset(bh, 0, sizeof (struct buffer_head)); + + init_waitqueue_head(&bh->b_wait); + bh->b_size = phys_seg[i].nr_sects << 9; + bh->b_dev = phys_seg[i].dev; + bh->b_rdev = phys_seg[i].dev; + bh->b_rsector = (unsigned long)phys_seg[i].sector_number; + bh->b_data = (char *)MMAP_VADDR(pending_idx, i) + + (phys_seg[i].buffer & ~PAGE_MASK); + bh->b_page = virt_to_page(MMAP_VADDR(pending_idx, i)); + bh->b_end_io = end_block_io_op; + bh->b_private = pending_req; + + bh->b_state = (1 << BH_Mapped) | (1 << BH_Lock) | + (1 << BH_Req) | (1 << BH_Launder); + if ( operation == WRITE ) + bh->b_state |= (1 << BH_JBD) | (1 << BH_Req) | (1 << BH_Uptodate); + + atomic_set(&bh->b_count, 1); + + /* Dispatch a single request. We'll flush it to disc later. */ + generic_make_request(operation, bh); + } + + return; + + bad_descriptor: + make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR); +} + + + +/****************************************************************** + * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING + */ + + +static void make_response(blkif_t *blkif, unsigned long id, + unsigned short op, int st) +{ + blkif_response_t *resp; + unsigned long flags; + + /* Place on the response ring for the relevant domain. */ + spin_lock_irqsave(&blkif->blk_ring_lock, flags); + resp = &blkif->blk_ring_base-> + ring[MASK_BLK_IDX(blkif->blk_resp_prod)].resp; + resp->id = id; + resp->operation = op; + resp->status = st; + wmb(); + blkif->blk_ring_base->resp_prod = ++blkif->blk_resp_prod; + spin_unlock_irqrestore(&blkif->blk_ring_lock, flags); + + /* Kick the relevant domain. 
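+     * (The event-channel notification wakes the front end so that it can
+     * consume the new response.)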
*/ + notify_via_evtchn(blkif->evtchn); +} + +void blkif_deschedule(blkif_t *blkif) +{ + remove_from_blkdev_list(blkif); +} + +static int __init init_module(void) +{ + int i; + + if ( !(start_info.flags & SIF_INITDOMAIN) + && !(start_info.flags & SIF_BLK_BE_DOMAIN) ) + return 0; + + blkif_interface_init(); + + if ( (mmap_vstart = allocate_empty_lowmem_region(MMAP_PAGES)) == 0 ) + BUG(); + + pending_cons = 0; + pending_prod = MAX_PENDING_REQS; + memset(pending_reqs, 0, sizeof(pending_reqs)); + for ( i = 0; i < MAX_PENDING_REQS; i++ ) + pending_ring[i] = i; + + spin_lock_init(&io_schedule_list_lock); + INIT_LIST_HEAD(&io_schedule_list); + + buffer_head_cachep = kmem_cache_create( + "buffer_head_cache", sizeof(struct buffer_head), + 0, SLAB_HWCACHE_ALIGN, NULL, NULL); + + blkif_ctrlif_init(); + + return 0; +} + +static void cleanup_module(void) +{ + BUG(); +} + +module_init(init_module); +module_exit(cleanup_module); diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/vbd.c b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/vbd.c new file mode 100644 index 0000000000..6704fbb541 --- /dev/null +++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/vbd.c @@ -0,0 +1,436 @@ +/****************************************************************************** + * arch/xen/drivers/blkif/backend/vbd.c + * + * Routines for managing virtual block devices (VBDs). + * + * Copyright (c) 2003-2004, Keir Fraser & Steve Hand + */ + +#include "common.h" + +void vbd_create(blkif_be_vbd_create_t *create) +{ + vbd_t *vbd; + rb_node_t **rb_p, *rb_parent = NULL; + blkif_t *blkif; + blkif_vdev_t vdevice = create->vdevice; + + blkif = blkif_find_by_handle(create->domid, create->blkif_handle); + if ( unlikely(blkif == NULL) ) + { + DPRINTK("vbd_create attempted for non-existent blkif (%u,%u)\n", + create->domid, create->blkif_handle); + create->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND; + return; + } + + spin_lock(&blkif->vbd_lock); + + rb_p = &blkif->vbd_rb.rb_node; + while ( *rb_p != NULL ) + { + rb_parent = *rb_p; + vbd = rb_entry(rb_parent, vbd_t, rb); + if ( vdevice < vbd->vdevice ) + { + rb_p = &rb_parent->rb_left; + } + else if ( vdevice > vbd->vdevice ) + { + rb_p = &rb_parent->rb_right; + } + else + { + DPRINTK("vbd_create attempted for already existing vbd\n"); + create->status = BLKIF_BE_STATUS_VBD_EXISTS; + goto out; + } + } + + if ( unlikely((vbd = kmalloc(sizeof(vbd_t), GFP_KERNEL)) == NULL) ) + { + DPRINTK("vbd_create: out of memory\n"); + create->status = BLKIF_BE_STATUS_OUT_OF_MEMORY; + goto out; + } + + vbd->vdevice = vdevice; + vbd->readonly = create->readonly; + vbd->type = VDISK_TYPE_DISK | VDISK_FLAG_VIRT; + vbd->extents = NULL; + + rb_link_node(&vbd->rb, rb_parent, rb_p); + rb_insert_color(&vbd->rb, &blkif->vbd_rb); + + DPRINTK("Successful creation of vdev=%04x (dom=%u)\n", + vdevice, create->domid); + create->status = BLKIF_BE_STATUS_OKAY; + + out: + spin_unlock(&blkif->vbd_lock); +} + + +/* Grow a VBD by appending a new extent. Fails if the VBD doesn't exist. 
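+ * (The new extent is linked at the tail of the extent list, preserving the
+ * order in which extents were grown.)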
*/ +void vbd_grow(blkif_be_vbd_grow_t *grow) +{ + blkif_t *blkif; + blkif_extent_le_t **px, *x; + vbd_t *vbd = NULL; + rb_node_t *rb; + blkif_vdev_t vdevice = grow->vdevice; + + blkif = blkif_find_by_handle(grow->domid, grow->blkif_handle); + if ( unlikely(blkif == NULL) ) + { + DPRINTK("vbd_grow attempted for non-existent blkif (%u,%u)\n", + grow->domid, grow->blkif_handle); + grow->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND; + return; + } + + spin_lock(&blkif->vbd_lock); + + rb = blkif->vbd_rb.rb_node; + while ( rb != NULL ) + { + vbd = rb_entry(rb, vbd_t, rb); + if ( vdevice < vbd->vdevice ) + rb = rb->rb_left; + else if ( vdevice > vbd->vdevice ) + rb = rb->rb_right; + else + break; + } + + if ( unlikely(vbd == NULL) || unlikely(vbd->vdevice != vdevice) ) + { + DPRINTK("vbd_grow: attempted to append extent to non-existent VBD.\n"); + grow->status = BLKIF_BE_STATUS_VBD_NOT_FOUND; + goto out; + } + + if ( unlikely((x = kmalloc(sizeof(blkif_extent_le_t), + GFP_KERNEL)) == NULL) ) + { + DPRINTK("vbd_grow: out of memory\n"); + grow->status = BLKIF_BE_STATUS_OUT_OF_MEMORY; + goto out; + } + + x->extent.device = grow->extent.device; + x->extent.sector_start = grow->extent.sector_start; + x->extent.sector_length = grow->extent.sector_length; + x->next = (blkif_extent_le_t *)NULL; + + for ( px = &vbd->extents; *px != NULL; px = &(*px)->next ) + continue; + + *px = x; + + DPRINTK("Successful grow of vdev=%04x (dom=%u)\n", + vdevice, grow->domid); + grow->status = BLKIF_BE_STATUS_OKAY; + + out: + spin_unlock(&blkif->vbd_lock); +} + + +void vbd_shrink(blkif_be_vbd_shrink_t *shrink) +{ + blkif_t *blkif; + blkif_extent_le_t **px, *x; + vbd_t *vbd = NULL; + rb_node_t *rb; + blkif_vdev_t vdevice = shrink->vdevice; + + blkif = blkif_find_by_handle(shrink->domid, shrink->blkif_handle); + if ( unlikely(blkif == NULL) ) + { + DPRINTK("vbd_shrink attempted for non-existent blkif (%u,%u)\n", + shrink->domid, shrink->blkif_handle); + shrink->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND; + return; + } + + spin_lock(&blkif->vbd_lock); + + rb = blkif->vbd_rb.rb_node; + while ( rb != NULL ) + { + vbd = rb_entry(rb, vbd_t, rb); + if ( vdevice < vbd->vdevice ) + rb = rb->rb_left; + else if ( vdevice > vbd->vdevice ) + rb = rb->rb_right; + else + break; + } + + if ( unlikely(vbd == NULL) || unlikely(vbd->vdevice != vdevice) ) + { + shrink->status = BLKIF_BE_STATUS_VBD_NOT_FOUND; + goto out; + } + + if ( unlikely(vbd->extents == NULL) ) + { + shrink->status = BLKIF_BE_STATUS_EXTENT_NOT_FOUND; + goto out; + } + + /* Find the last extent. We now know that there is at least one. 
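+     * (vbd_shrink() thus removes the most recently grown extent.)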
+     */
+    for ( px = &vbd->extents; (*px)->next != NULL; px = &(*px)->next )
+        continue;
+
+    x = *px;
+    *px = x->next;
+    kfree(x);
+
+    shrink->status = BLKIF_BE_STATUS_OKAY;
+
+ out:
+    spin_unlock(&blkif->vbd_lock);
+}
+
+
+void vbd_destroy(blkif_be_vbd_destroy_t *destroy)
+{
+    blkif_t *blkif;
+    vbd_t *vbd;
+    rb_node_t *rb;
+    blkif_extent_le_t *x, *t;
+    blkif_vdev_t vdevice = destroy->vdevice;
+
+    blkif = blkif_find_by_handle(destroy->domid, destroy->blkif_handle);
+    if ( unlikely(blkif == NULL) )
+    {
+        DPRINTK("vbd_destroy attempted for non-existent blkif (%u,%u)\n",
+                destroy->domid, destroy->blkif_handle);
+        destroy->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
+        return;
+    }
+
+    spin_lock(&blkif->vbd_lock);
+
+    rb = blkif->vbd_rb.rb_node;
+    while ( rb != NULL )
+    {
+        vbd = rb_entry(rb, vbd_t, rb);
+        if ( vdevice < vbd->vdevice )
+            rb = rb->rb_left;
+        else if ( vdevice > vbd->vdevice )
+            rb = rb->rb_right;
+        else
+            goto found;
+    }
+
+    destroy->status = BLKIF_BE_STATUS_VBD_NOT_FOUND;
+    goto out;
+
+ found:
+    rb_erase(rb, &blkif->vbd_rb);
+    x = vbd->extents;
+    kfree(vbd);
+
+    while ( x != NULL )
+    {
+        t = x->next;
+        kfree(x);
+        x = t;
+    }
+
+ out:
+    spin_unlock(&blkif->vbd_lock);
+}
+
+
+void destroy_all_vbds(blkif_t *blkif)
+{
+    vbd_t *vbd;
+    rb_node_t *rb;
+    blkif_extent_le_t *x, *t;
+
+    spin_lock(&blkif->vbd_lock);
+
+    while ( (rb = blkif->vbd_rb.rb_node) != NULL )
+    {
+        vbd = rb_entry(rb, vbd_t, rb);
+
+        rb_erase(rb, &blkif->vbd_rb);
+        x = vbd->extents;
+        kfree(vbd);
+
+        while ( x != NULL )
+        {
+            t = x->next;
+            kfree(x);
+            x = t;
+        }
+    }
+
+    spin_unlock(&blkif->vbd_lock);
+}
+
+
+static int vbd_probe_single(blkif_t *blkif, vdisk_t *vbd_info, vbd_t *vbd)
+{
+    blkif_extent_le_t *x;
+
+    vbd_info->device = vbd->vdevice;
+    vbd_info->info = vbd->type;
+    if ( vbd->readonly )
+        vbd_info->info |= VDISK_FLAG_RO;
+    vbd_info->capacity = 0ULL;
+    for ( x = vbd->extents; x != NULL; x = x->next )
+        vbd_info->capacity += x->extent.sector_length;
+
+    return 0;
+}
+
+
+int vbd_probe(blkif_t *blkif, vdisk_t *vbd_info, int max_vbds)
+{
+    int rc = 0, nr_vbds = 0;
+    rb_node_t *rb;
+
+    spin_lock(&blkif->vbd_lock);
+
+    if ( (rb = blkif->vbd_rb.rb_node) == NULL )
+        goto out;
+
+ new_subtree:
+    /* STEP 1. Find least node (it'll be left-most). */
+    while ( rb->rb_left != NULL )
+        rb = rb->rb_left;
+
+    for ( ; ; )
+    {
+        /* STEP 2. Dealt with left subtree. Now process current node. */
+        if ( (rc = vbd_probe_single(blkif, &vbd_info[nr_vbds],
+                                    rb_entry(rb, vbd_t, rb))) != 0 )
+            goto out;
+        if ( ++nr_vbds == max_vbds )
+            goto out;
+
+        /* STEP 3. Process right subtree, if any. */
+        if ( rb->rb_right != NULL )
+        {
+            rb = rb->rb_right;
+            goto new_subtree;
+        }
+
+        /* STEP 4. Done both subtrees. Head back through ancestors. */
+        for ( ; ; )
+        {
+            /* We're done when we get back to the root node. */
+            if ( rb->rb_parent == NULL )
+                goto out;
+            /* If we are left of parent, then parent is next to process. */
+            if ( rb->rb_parent->rb_left == rb )
+                break;
+            /* If we are right of parent, then we climb to grandparent. */
+            rb = rb->rb_parent;
+        }
+
+        rb = rb->rb_parent;
+    }
+
+ out:
+    spin_unlock(&blkif->vbd_lock);
+    return (rc == 0) ? nr_vbds : rc;
+}
+
+
+int vbd_translate(phys_seg_t *pseg, blkif_t *blkif, int operation)
+{
+    blkif_extent_le_t *x;
+    vbd_t *vbd;
+    rb_node_t *rb;
+    blkif_sector_t sec_off;
+    unsigned long nr_secs;
+
+    spin_lock(&blkif->vbd_lock);
+
+    rb = blkif->vbd_rb.rb_node;
+    while ( rb != NULL )
+    {
+        vbd = rb_entry(rb, vbd_t, rb);
+        if ( pseg->dev < vbd->vdevice )
+            rb = rb->rb_left;
+        else if ( pseg->dev > vbd->vdevice )
+            rb = rb->rb_right;
+        else
+            goto found;
+    }
+
+    DPRINTK("vbd_translate: domain %u attempted to access "
+            "non-existent VBD.\n", blkif->domid);
+
+    spin_unlock(&blkif->vbd_lock);
+    return -ENODEV;
+
+ found:
+
+    if ( (operation == WRITE) && vbd->readonly )
+    {
+        spin_unlock(&blkif->vbd_lock);
+        return -EACCES;
+    }
+
+    /*
+     * Now iterate through the list of blkif_extents, working out which should
+     * be used to perform the translation.
+     */
+    sec_off = pseg->sector_number;
+    nr_secs = pseg->nr_sects;
+    for ( x = vbd->extents; x != NULL; x = x->next )
+    {
+        if ( sec_off < x->extent.sector_length )
+        {
+            pseg->dev = x->extent.device;
+            pseg->sector_number = x->extent.sector_start + sec_off;
+            if ( unlikely((sec_off + nr_secs) > x->extent.sector_length) )
+                goto overrun;
+            spin_unlock(&blkif->vbd_lock);
+            return 1;
+        }
+        sec_off -= x->extent.sector_length;
+    }
+
+    DPRINTK("vbd_translate: end of vbd.\n");
+    spin_unlock(&blkif->vbd_lock);
+    return -EACCES;
+
+    /*
+     * Here we deal with overrun onto the following extent. We don't deal with
+     * overrun of more than one boundary since each request is restricted to
+     * 2^9 512-byte sectors, so it should be trivial for control software to
+     * ensure that extents are large enough to prevent excessive overrun.
+     */
+ overrun:
+
+    /* Adjust length of first chunk to run to end of first extent. */
+    pseg[0].nr_sects = x->extent.sector_length - sec_off;
+
+    /* Set second chunk buffer and length to start where first chunk ended. */
+    pseg[1].buffer = pseg[0].buffer + (pseg[0].nr_sects << 9);
+    pseg[1].nr_sects = nr_secs - pseg[0].nr_sects;
+
+    /* Now move to the next extent. Check it exists and is long enough! */
+    if ( unlikely((x = x->next) == NULL) ||
+         unlikely(x->extent.sector_length < pseg[1].nr_sects) )
+    {
+        DPRINTK("vbd_translate: multiple overruns or end of vbd.\n");
+        spin_unlock(&blkif->vbd_lock);
+        return -EACCES;
+    }
+
+    /* Store the real device and start sector for the second chunk. */
+    pseg[1].dev = x->extent.device;
+    pseg[1].sector_number = x->extent.sector_start;
+
+    spin_unlock(&blkif->vbd_lock);
+    return 2;
+}
diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/blkif.h b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/blkif.h
new file mode 100644
index 0000000000..1024629ea7
--- /dev/null
+++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/blkif.h
@@ -0,0 +1,115 @@
+/******************************************************************************
+ * blkif.h
+ *
+ * Unified block-device I/O interface for Xen guest OSes.
+ *
+ * Copyright (c) 2003-2004, Keir Fraser
+ */
+
+#ifndef __SHARED_BLKIF_H__
+#define __SHARED_BLKIF_H__
+
+#define blkif_vdev_t u16
+#define blkif_sector_t u64
+
+#define BLKIF_OP_READ 0
+#define BLKIF_OP_WRITE 1
+#define BLKIF_OP_PROBE 2
+
+/* NB. Ring size must be small enough for sizeof(blkif_ring_t) <= PAGE_SIZE. */
+#define BLKIF_RING_SIZE 64
+
+/*
+ * Maximum scatter/gather segments per request.
+ * This is carefully chosen so that sizeof(blkif_ring_t) <= PAGE_SIZE.
+ * NB. This could be 12 if the ring indexes weren't stored in the same page.
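+ * (With 11 segments each request is 60 bytes, so 64 ring entries plus the
+ * two ring indexes come to 3848 bytes; at 12 segments the 64 entries alone
+ * would fill the entire 4096-byte page.)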
+ */ +#define BLKIF_MAX_SEGMENTS_PER_REQUEST 11 + +typedef struct { + u8 operation; /* 0: BLKIF_OP_??? */ + u8 nr_segments; /* 1: number of segments */ + blkif_vdev_t device; /* 2: only for read/write requests */ + unsigned long id; /* 4: private guest value, echoed in resp */ + blkif_sector_t sector_number; /* start sector idx on disk (r/w only) */ + /* @f_a_s[2:0]=last_sect ; @f_a_s[5:3]=first_sect ; @f_a_s[:12]=frame. */ + /* @first_sect: first sector in frame to transfer (inclusive). */ + /* @last_sect: last sector in frame to transfer (inclusive). */ + /* @frame: machine page frame number. */ + unsigned long frame_and_sects[BLKIF_MAX_SEGMENTS_PER_REQUEST]; +} PACKED blkif_request_t; + +#define blkif_first_sect(_fas) (((_fas)>>3)&7) +#define blkif_last_sect(_fas) ((_fas)&7) + +typedef struct { + unsigned long id; /* copied from request */ + u8 operation; /* copied from request */ + s16 status; /* BLKIF_RSP_??? */ +} PACKED blkif_response_t; + +#define BLKIF_RSP_ERROR -1 /* non-specific 'error' */ +#define BLKIF_RSP_OKAY 0 /* non-specific 'okay' */ + +/* + * We use a special capitalised type name because it is _essential_ that all + * arithmetic on indexes is done on an integer type of the correct size. + */ +typedef u32 BLKIF_RING_IDX; + +/* + * Ring indexes are 'free running'. That is, they are not stored modulo the + * size of the ring buffer. The following macro converts a free-running counter + * into a value that can directly index a ring-buffer array. + */ +#define MASK_BLKIF_IDX(_i) ((_i)&(BLKIF_RING_SIZE-1)) + +typedef struct { + BLKIF_RING_IDX req_prod; /* 0: Request producer. Updated by front-end. */ + BLKIF_RING_IDX resp_prod; /* 4: Response producer. Updated by back-end. */ + union { /* 8 */ + blkif_request_t req; + blkif_response_t resp; + } PACKED ring[BLKIF_RING_SIZE]; +} PACKED blkif_ring_t; + + +/* + * BLKIF_OP_PROBE: + * The request format for a probe request is constrained as follows: + * @operation == BLKIF_OP_PROBE + * @nr_segments == size of probe buffer in pages + * @device == unused (zero) + * @id == any value (echoed in response message) + * @sector_num == unused (zero) + * @frame_and_sects == list of page-sized buffers. + * (i.e., @first_sect == 0, @last_sect == 7). + * + * The response is a list of vdisk_t elements copied into the out-of-band + * probe buffer. On success the response status field contains the number + * of vdisk_t elements. + */ + +/* XXX SMH: Type values below are chosen to match ide_xxx in Linux ide.h. */ +#define VDISK_TYPE_FLOPPY 0x00 +#define VDISK_TYPE_TAPE 0x01 +#define VDISK_TYPE_CDROM 0x05 +#define VDISK_TYPE_OPTICAL 0x07 +#define VDISK_TYPE_DISK 0x20 + +#define VDISK_TYPE_MASK 0x3F +#define VDISK_TYPE(_x) ((_x) & VDISK_TYPE_MASK) + +/* The top two bits of the type field encode various flags. */ +#define VDISK_FLAG_RO 0x40 +#define VDISK_FLAG_VIRT 0x80 +#define VDISK_READONLY(_x) ((_x) & VDISK_FLAG_RO) +#define VDISK_VIRTUAL(_x) ((_x) & VDISK_FLAG_VIRT) + +typedef struct { + blkif_sector_t capacity; /* 0: Size in terms of 512-byte sectors. */ + blkif_vdev_t device; /* 8: Device number (opaque 16 bit value). */ + u16 info; /* 10: Device type and flags (VDISK_*). 
*/ +} PACKED vdisk_t; /* 12 bytes */ + +#endif /* __SHARED_BLKIF_H__ */ diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/frontend/Makefile b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/frontend/Makefile new file mode 100644 index 0000000000..b0d27cf698 --- /dev/null +++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/frontend/Makefile @@ -0,0 +1,3 @@ +O_TARGET := drv.o +obj-y := main.o vbd.o +include $(TOPDIR)/Rules.make diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/frontend/common.h b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/frontend/common.h new file mode 100644 index 0000000000..2d4415bdef --- /dev/null +++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/frontend/common.h @@ -0,0 +1,84 @@ +/****************************************************************************** + * arch/xen/drivers/blkif/frontend/common.h + * + * Shared definitions between all levels of XenoLinux Virtual block devices. + */ + +#ifndef __XEN_DRIVERS_COMMON_H__ +#define __XEN_DRIVERS_COMMON_H__ + +#include <linux/config.h> +#include <linux/module.h> + +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/errno.h> + +#include <linux/fs.h> +#include <linux/hdreg.h> +#include <linux/blkdev.h> +#include <linux/major.h> + +#include <asm/hypervisor-ifs/hypervisor-if.h> +#include <asm/hypervisor-ifs/vbd.h> +#include <asm/io.h> +#include <asm/atomic.h> +#include <asm/uaccess.h> + +#include "../blkif.h" + +#if 0 +#define DPRINTK(_f, _a...) printk ( KERN_ALERT _f , ## _a ) +#else +#define DPRINTK(_f, _a...) ((void)0) +#endif + +#if 0 +#define DPRINTK_IOCTL(_f, _a...) printk ( KERN_ALERT _f , ## _a ) +#else +#define DPRINTK_IOCTL(_f, _a...) ((void)0) +#endif + +/* Private gendisk->flags[] values. */ +#define GENHD_FL_XEN 2 /* Is unit a Xen block device? */ +#define GENHD_FL_VIRT_PARTNS 4 /* Are unit partitions virtual? */ + +/* + * We have one of these per vbd, whether ide, scsi or 'other'. + * They hang in an array off the gendisk structure. We may end up putting + * all kinds of interesting stuff here :-) + */ +typedef struct xl_disk { + int usage; +} xl_disk_t; + +extern int blkif_open(struct inode *inode, struct file *filep); +extern int blkif_release(struct inode *inode, struct file *filep); +extern int blkif_ioctl(struct inode *inode, struct file *filep, + unsigned command, unsigned long argument); +extern int blkif_check(kdev_t dev); +extern int blkif_revalidate(kdev_t dev); +extern void blkif_control_send(blkif_request_t *req, blkif_response_t *rsp); +extern void do_blkif_request (request_queue_t *rq); + +extern void xlvbd_update_vbds(void); + +static inline xl_disk_t *xldev_to_xldisk(kdev_t xldev) +{ + struct gendisk *gd = get_gendisk(xldev); + + if ( gd == NULL ) + return NULL; + + return (xl_disk_t *)gd->real_devices + + (MINOR(xldev) >> gd->minor_shift); +} + + +/* Virtual block-device subsystem. */ +extern int xlvbd_init(void); +extern void xlvbd_cleanup(void); + +#endif /* __XEN_DRIVERS_COMMON_H__ */ diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/frontend/main.c b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/frontend/main.c new file mode 100644 index 0000000000..4e5dbca093 --- /dev/null +++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/frontend/main.c @@ -0,0 +1,814 @@ +/****************************************************************************** + * arch/xen/drivers/blkif/frontend/main.c + * + * Xenolinux virtual block-device driver. 
+ * + * Copyright (c) 2003-2004, Keir Fraser & Steve Hand + * Modifications by Mark A. Williamson are (c) Intel Research Cambridge + */ + +#include "common.h" +#include <linux/blk.h> +#include <linux/cdrom.h> +#include <linux/tqueue.h> +#include <linux/sched.h> +#include <linux/interrupt.h> +#include <scsi/scsi.h> +#include <asm/ctrl_if.h> + + + +typedef unsigned char byte; /* from linux/ide.h */ + +#define BLKIF_STATE_CLOSED 0 +#define BLKIF_STATE_DISCONNECTED 1 +#define BLKIF_STATE_CONNECTED 2 +static unsigned int blkif_state = BLKIF_STATE_CLOSED; +static unsigned int blkif_evtchn, blkif_irq; + +static int blkif_control_rsp_valid; +static blkif_response_t blkif_control_rsp; + +static blkif_ring_t *blk_ring; +static BLK_RING_IDX resp_cons; /* Response consumer for comms ring. */ +static BLK_RING_IDX req_prod; /* Private request producer. */ + + +static blkif_ring_t *blk_ring_rec; /* Private copy of requests, used for + * recovery. Responses not stored here. */ +static BLK_RING_IDX resp_cons_rec; /* Copy of response consumer, used for + * recovery */ +static int recovery = 0; /* "Recovery in progress" flag. Protected + * by the io_request_lock */ + + +/* We plug the I/O ring if the driver is suspended or if the ring is full. */ +#define RING_PLUGGED (((req_prod - resp_cons) == BLK_RING_SIZE) || \ + (blkif_state != BLKIF_STATE_CONNECTED)) + + +/* + * Request queues with outstanding work, but ring is currently full. + * We need no special lock here, as we always access this with the + * io_request_lock held. We only need a small maximum list. + */ +#define MAX_PENDING 8 +static request_queue_t *pending_queues[MAX_PENDING]; +static int nr_pending; + +static kdev_t sg_dev; +static int sg_operation = -1; +static unsigned long sg_next_sect; +#define DISABLE_SCATTERGATHER() (sg_operation = -1) + +static inline void flush_requests(void) +{ + DISABLE_SCATTERGATHER(); + blk_ring->req_prod = req_prod; + notify_via_evtchn(blkif_evtchn); +} + + +/* + * blkif_update_int/update-vbds_task - handle VBD update events. + * Schedule a task for keventd to run, which will update the VBDs and perform + * the corresponding updates to our view of VBD state. + */ +static struct tq_struct update_tq; +static void update_vbds_task(void *unused) +{ + xlvbd_update_vbds(); +} + + +int blkif_open(struct inode *inode, struct file *filep) +{ + short xldev = inode->i_rdev; + struct gendisk *gd = get_gendisk(xldev); + xl_disk_t *disk = xldev_to_xldisk(inode->i_rdev); + short minor = MINOR(xldev); + + if ( gd->part[minor].nr_sects == 0 ) + { + /* + * Device either doesn't exist, or has zero capacity; we use a few + * cheesy heuristics to return the relevant error code + */ + if ( (gd->sizes[minor >> gd->minor_shift] != 0) || + ((minor & (gd->max_p - 1)) != 0) ) + { + /* + * We have a real device, but no such partition, or we just have a + * partition number so guess this is the problem. + */ + return -ENXIO; /* no such device or address */ + } + else if ( gd->flags[minor >> gd->minor_shift] & GENHD_FL_REMOVABLE ) + { + /* This is a removable device => assume that media is missing. */ + return -ENOMEDIUM; /* media not present (this is a guess) */ + } + else + { + /* Just go for the general 'no such device' error. */ + return -ENODEV; /* no such device */ + } + } + + /* Update of usage count is protected by per-device semaphore. 
*/ + disk->usage++; + + return 0; +} + + +int blkif_release(struct inode *inode, struct file *filep) +{ + xl_disk_t *disk = xldev_to_xldisk(inode->i_rdev); + + /* + * When usage drops to zero it may allow more VBD updates to occur. + * Update of usage count is protected by a per-device semaphore. + */ + if ( --disk->usage == 0 ) + { +#if 0 + update_tq.routine = update_vbds_task; + schedule_task(&update_tq); +#endif + } + + return 0; +} + + +int blkif_ioctl(struct inode *inode, struct file *filep, + unsigned command, unsigned long argument) +{ + kdev_t dev = inode->i_rdev; + struct hd_geometry *geo = (struct hd_geometry *)argument; + struct gendisk *gd; + struct hd_struct *part; + int i; + + /* NB. No need to check permissions. That is done for us. */ + + DPRINTK_IOCTL("command: 0x%x, argument: 0x%lx, dev: 0x%04x\n", + command, (long) argument, dev); + + gd = get_gendisk(dev); + part = &gd->part[MINOR(dev)]; + + switch ( command ) + { + case BLKGETSIZE: + DPRINTK_IOCTL(" BLKGETSIZE: %x %lx\n", BLKGETSIZE, part->nr_sects); + return put_user(part->nr_sects, (unsigned long *) argument); + + case BLKGETSIZE64: + DPRINTK_IOCTL(" BLKGETSIZE64: %x %llx\n", BLKGETSIZE64, + (u64)part->nr_sects * 512); + return put_user((u64)part->nr_sects * 512, (u64 *) argument); + + case BLKRRPART: /* re-read partition table */ + DPRINTK_IOCTL(" BLKRRPART: %x\n", BLKRRPART); + return blkif_revalidate(dev); + + case BLKSSZGET: + return hardsect_size[MAJOR(dev)][MINOR(dev)]; + + case BLKBSZGET: /* get block size */ + DPRINTK_IOCTL(" BLKBSZGET: %x\n", BLKBSZGET); + break; + + case BLKBSZSET: /* set block size */ + DPRINTK_IOCTL(" BLKBSZSET: %x\n", BLKBSZSET); + break; + + case BLKRASET: /* set read-ahead */ + DPRINTK_IOCTL(" BLKRASET: %x\n", BLKRASET); + break; + + case BLKRAGET: /* get read-ahead */ + DPRINTK_IOCTL(" BLKRAGET: %x\n", BLKRAGET); + break; + + case HDIO_GETGEO: + /* note: these values are complete garbage */ + DPRINTK_IOCTL(" HDIO_GETGEO: %x\n", HDIO_GETGEO); + if (!argument) return -EINVAL; + if (put_user(0x00, (unsigned long *) &geo->start)) return -EFAULT; + if (put_user(0xff, (byte *)&geo->heads)) return -EFAULT; + if (put_user(0x3f, (byte *)&geo->sectors)) return -EFAULT; + if (put_user(0x106, (unsigned short *)&geo->cylinders)) return -EFAULT; + return 0; + + case HDIO_GETGEO_BIG: + /* note: these values are complete garbage */ + DPRINTK_IOCTL(" HDIO_GETGEO_BIG: %x\n", HDIO_GETGEO_BIG); + if (!argument) return -EINVAL; + if (put_user(0x00, (unsigned long *) &geo->start)) return -EFAULT; + if (put_user(0xff, (byte *)&geo->heads)) return -EFAULT; + if (put_user(0x3f, (byte *)&geo->sectors)) return -EFAULT; + if (put_user(0x106, (unsigned int *) &geo->cylinders)) return -EFAULT; + return 0; + + case CDROMMULTISESSION: + DPRINTK("FIXME: support multisession CDs later\n"); + for ( i = 0; i < sizeof(struct cdrom_multisession); i++ ) + if ( put_user(0, (byte *)(argument + i)) ) return -EFAULT; + return 0; + + case SCSI_IOCTL_GET_BUS_NUMBER: + DPRINTK("FIXME: SCSI_IOCTL_GET_BUS_NUMBER ioctl in XL blkif"); + return -ENOSYS; + + default: + printk(KERN_ALERT "ioctl %08x not supported by XL blkif\n", command); + return -ENOSYS; + } + + return 0; +} + +/* check media change: should probably do something here in some cases :-) */ +int blkif_check(kdev_t dev) +{ + DPRINTK("blkif_check\n"); + return 0; +} + +int blkif_revalidate(kdev_t dev) +{ + struct block_device *bd; + struct gendisk *gd; + xl_disk_t *disk; + unsigned long capacity; + int i, rc = 0; + + if ( (bd = bdget(dev)) == NULL ) + return -EINVAL; + + 
/* + * Update of partition info, and check of usage count, is protected + * by the per-block-device semaphore. + */ + down(&bd->bd_sem); + + if ( ((gd = get_gendisk(dev)) == NULL) || + ((disk = xldev_to_xldisk(dev)) == NULL) || + ((capacity = gd->part[MINOR(dev)].nr_sects) == 0) ) + { + rc = -EINVAL; + goto out; + } + + if ( disk->usage > 1 ) + { + rc = -EBUSY; + goto out; + } + + /* Only reread partition table if VBDs aren't mapped to partitions. */ + if ( !(gd->flags[MINOR(dev) >> gd->minor_shift] & GENHD_FL_VIRT_PARTNS) ) + { + for ( i = gd->max_p - 1; i >= 0; i-- ) + { + invalidate_device(dev+i, 1); + gd->part[MINOR(dev+i)].start_sect = 0; + gd->part[MINOR(dev+i)].nr_sects = 0; + gd->sizes[MINOR(dev+i)] = 0; + } + + grok_partitions(gd, MINOR(dev)>>gd->minor_shift, gd->max_p, capacity); + } + + out: + up(&bd->bd_sem); + bdput(bd); + return rc; +} + + +/* + * blkif_queue_request + * + * request block io + * + * id: for guest use only. + * operation: BLKIF_OP_{READ,WRITE,PROBE} + * buffer: buffer to read/write into. this should be a + * virtual address in the guest os. + */ +static int blkif_queue_request(unsigned long id, + int operation, + char * buffer, + unsigned long sector_number, + unsigned short nr_sectors, + kdev_t device) +{ + unsigned long buffer_ma = phys_to_machine(virt_to_phys(buffer)); + struct gendisk *gd; + blkif_request_t *req; + struct buffer_head *bh; + unsigned int fsect, lsect; + + fsect = (buffer_ma & ~PAGE_MASK) >> 9; + lsect = fsect + nr_sectors - 1; + + /* Buffer must be sector-aligned. Extent mustn't cross a page boundary. */ + if ( unlikely((buffer_ma & ((1<<9)-1)) != 0) ) + BUG(); + if ( lsect > 7 ) + BUG(); + + buffer_ma &= PAGE_MASK; + + if ( unlikely(blkif_state != BLKIF_STATE_CONNECTED) ) + return 1; + + switch ( operation ) + { + + case BLKIF_OP_READ: + case BLKIF_OP_WRITE: + gd = get_gendisk(device); + + /* + * Update the sector_number we'll pass down as appropriate; note that + * we could sanity check that resulting sector will be in this + * partition, but this will happen in driver backend anyhow. + */ + sector_number += gd->part[MINOR(device)].start_sect; + + /* + * If this unit doesn't consist of virtual partitions then we clear + * the partn bits from the device number. + */ + if ( !(gd->flags[MINOR(device)>>gd->minor_shift] & + GENHD_FL_VIRT_PARTNS) ) + device &= ~(gd->max_p - 1); + + if ( (sg_operation == operation) && + (sg_dev == device) && + (sg_next_sect == sector_number) ) + { + req = &blk_ring->ring[MASK_BLK_IDX(req_prod-1)].req; + bh = (struct buffer_head *)id; + bh->b_reqnext = (struct buffer_head *)req->id; + req->id = id; + req->frame_and_sects[req->nr_segments] = + buffer_ma | (fsect<<3) | lsect; + if ( ++req->nr_segments < BLKIF_MAX_SEGMENTS_PER_REQUEST ) + sg_next_sect += nr_sectors; + else + DISABLE_SCATTERGATHER(); + + /* Update the copy of the request in the recovery ring. */ + blk_ring_rec->ring[MASK_BLK_IDX(blk_ring_rec->req_prod - 1)].req + = *req; + + return 0; + } + else if ( RING_PLUGGED ) + { + return 1; + } + else + { + sg_operation = operation; + sg_dev = device; + sg_next_sect = sector_number + nr_sectors; + } + break; + + default: + panic("unknown op %d\n", operation); + } + + /* Fill out a communications ring structure. 
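+     * For illustration: each frame_and_sects[] entry packs the machine page +     * address of the buffer with a first/last-sector pair within that page +     * (eight 512-byte sectors per 4KB page, three bits each). E.g. a buffer +     * at page offset 0x400 spanning three sectors gives fsect = 2, lsect = 4, +     * so the entry is page_base | (2<<3) | 4.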
*/ + req = &blk_ring->ring[MASK_BLK_IDX(req_prod)].req; + req->id = id; + req->operation = operation; + req->sector_number = (blkif_sector_t)sector_number; + req->device = device; + req->nr_segments = 1; + req->frame_and_sects[0] = buffer_ma | (fsect<<3) | lsect; + req_prod++; + + /* Keep a private copy so we can reissue requests when recovering. */ + blk_ring_rec->ring[MASK_BLK_IDX(blk_ring_rec->req_prod)].req = *req; + blk_ring_rec->req_prod++; + + return 0; +} + + +/* + * do_blkif_request + * read a block; request is in a request queue + */ +void do_blkif_request(request_queue_t *rq) +{ + struct request *req; + struct buffer_head *bh, *next_bh; + int rw, nsect, full, queued = 0; + + DPRINTK("Entered do_blkif_request\n"); + + while ( !rq->plugged && !list_empty(&rq->queue_head)) + { + if ( (req = blkdev_entry_next_request(&rq->queue_head)) == NULL ) + goto out; + + DPRINTK("do_blkif_request %p: cmd %i, sec %lx, (%li/%li) bh:%p\n", + req, req->cmd, req->sector, + req->current_nr_sectors, req->nr_sectors, req->bh); + + rw = req->cmd; + if ( rw == READA ) + rw = READ; + if ( unlikely((rw != READ) && (rw != WRITE)) ) + panic("XenoLinux Virtual Block Device: bad cmd: %d\n", rw); + + req->errors = 0; + + bh = req->bh; + while ( bh != NULL ) + { + next_bh = bh->b_reqnext; + bh->b_reqnext = NULL; + + full = blkif_queue_request( + (unsigned long)bh, + (rw == READ) ? BLKIF_OP_READ : BLKIF_OP_WRITE, + bh->b_data, bh->b_rsector, bh->b_size>>9, bh->b_rdev); + + if ( full ) + { + bh->b_reqnext = next_bh; + pending_queues[nr_pending++] = rq; + if ( unlikely(nr_pending >= MAX_PENDING) ) + BUG(); + goto out; + } + + queued++; + + /* Dequeue the buffer head from the request. */ + nsect = bh->b_size >> 9; + bh = req->bh = next_bh; + + if ( bh != NULL ) + { + /* There's another buffer head to do. Update the request. */ + req->hard_sector += nsect; + req->hard_nr_sectors -= nsect; + req->sector = req->hard_sector; + req->nr_sectors = req->hard_nr_sectors; + req->current_nr_sectors = bh->b_size >> 9; + req->buffer = bh->b_data; + } + else + { + /* That was the last buffer head. Finalise the request. */ + if ( unlikely(end_that_request_first(req, 1, "XenBlk")) ) + BUG(); + blkdev_dequeue_request(req); + end_that_request_last(req); + } + } + } + + out: + if ( queued != 0 ) + flush_requests(); +} + + +static void kick_pending_request_queues(void) +{ + /* We kick pending request queues if the ring is reasonably empty. */ + if ( (nr_pending != 0) && + ((req_prod - resp_cons) < (BLK_RING_SIZE >> 1)) ) + { + /* Attempt to drain the queue, but bail if the ring becomes full. 
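+         * Note that pending_queues[] behaves as a stack: the most recently +         * plugged queue is drained first. Entries are only ever added, under +         * io_request_lock, while the ring is full, so a small MAX_PENDING +         * suffices.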
*/ + while ( (nr_pending != 0) && !RING_PLUGGED ) + do_blkif_request(pending_queues[--nr_pending]); + } +} + + +static void blkif_int(int irq, void *dev_id, struct pt_regs *ptregs) +{ + BLK_RING_IDX i; + unsigned long flags; + struct buffer_head *bh, *next_bh; + +// printk(KERN_ALERT "blkif_int\n"); + + spin_lock_irqsave(&io_request_lock, flags); + + if ( unlikely(blkif_state == BLKIF_STATE_CLOSED || recovery) ) + { + printk("Bailed out\n"); + + spin_unlock_irqrestore(&io_request_lock, flags); + return; + } + + for ( i = resp_cons; i != blk_ring->resp_prod; i++ ) + { + blkif_response_t *bret = &blk_ring->ring[MASK_BLK_IDX(i)].resp; + switch ( bret->operation ) + { + case BLKIF_OP_READ: + case BLKIF_OP_WRITE: + if ( unlikely(bret->status != BLKIF_RSP_OKAY) ) + DPRINTK("Bad return from blkdev data request: %lx\n", + bret->status); + for ( bh = (struct buffer_head *)bret->id; + bh != NULL; + bh = next_bh ) + { + next_bh = bh->b_reqnext; + bh->b_reqnext = NULL; + bh->b_end_io(bh, bret->status == BLKIF_RSP_OKAY); + } + break; + case BLKIF_OP_PROBE: + memcpy(&blkif_control_rsp, bret, sizeof(*bret)); + blkif_control_rsp_valid = 1; + break; + default: + BUG(); + } + } + + resp_cons = i; + resp_cons_rec = i; + + kick_pending_request_queues(); + + spin_unlock_irqrestore(&io_request_lock, flags); +} + + +void blkif_control_send(blkif_request_t *req, blkif_response_t *rsp) +{ + unsigned long flags; + + retry: + while ( (req_prod - resp_cons) == BLK_RING_SIZE ) + { + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(1); + } + + spin_lock_irqsave(&io_request_lock, flags); + if ( (req_prod - resp_cons) == BLK_RING_SIZE ) + { + spin_unlock_irqrestore(&io_request_lock, flags); + goto retry; + } + + DISABLE_SCATTERGATHER(); + memcpy(&blk_ring->ring[MASK_BLK_IDX(req_prod)].req, req, sizeof(*req)); + memcpy(&blk_ring_rec->ring[MASK_BLK_IDX(blk_ring_rec->req_prod++)].req, + req, sizeof(*req)); + req_prod++; + flush_requests(); + + spin_unlock_irqrestore(&io_request_lock, flags); + + while ( !blkif_control_rsp_valid ) + { + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(1); + } + + memcpy(rsp, &blkif_control_rsp, sizeof(*rsp)); + blkif_control_rsp_valid = 0; +} + + +static void blkif_status_change(blkif_fe_interface_status_changed_t *status) +{ + ctrl_msg_t cmsg; + blkif_fe_interface_connect_t up; + + if ( status->handle != 0 ) + { + printk(KERN_WARNING "Status change on unsupported blkif %d\n", + status->handle); + return; + } + + switch ( status->status ) + { + case BLKIF_INTERFACE_STATUS_DESTROYED: + printk(KERN_WARNING "Unexpected blkif-DESTROYED message in state %d\n", + blkif_state); + break; + + case BLKIF_INTERFACE_STATUS_DISCONNECTED: + if ( blkif_state != BLKIF_STATE_CLOSED ) + { + printk(KERN_WARNING "Unexpected blkif-DISCONNECTED message" + " in state %d\n", blkif_state); + + printk(KERN_INFO "VBD driver recovery in progress\n"); + + /* Prevent new requests being issued until we've fixed things up. */ + spin_lock_irq(&io_request_lock); + recovery = 1; + blkif_state = BLKIF_STATE_DISCONNECTED; + spin_unlock_irq(&io_request_lock); + + /* Free resources associated with old device channel. */ + free_page((unsigned long)blk_ring); + free_irq(blkif_irq, NULL); + unbind_evtchn_from_irq(blkif_evtchn); + } + + /* Move from CLOSED to DISCONNECTED state. 
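+     * A fresh page is allocated for the shared ring and its machine frame +     * number is handed to the controller in the CONNECT message below, so +     * that the backend can map the same page into its own address space.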
*/ + blk_ring = (blkif_ring_t *)__get_free_page(GFP_KERNEL); + blk_ring->req_prod = blk_ring->resp_prod = resp_cons = req_prod = 0; + blkif_state = BLKIF_STATE_DISCONNECTED; + + /* Construct an interface-CONNECT message for the domain controller. */ + cmsg.type = CMSG_BLKIF_FE; + cmsg.subtype = CMSG_BLKIF_FE_INTERFACE_CONNECT; + cmsg.length = sizeof(blkif_fe_interface_connect_t); + up.handle = 0; + up.shmem_frame = virt_to_machine(blk_ring) >> PAGE_SHIFT; + memcpy(cmsg.msg, &up, sizeof(up)); + + /* Tell the controller to bring up the interface. */ + ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE); + break; + + case BLKIF_INTERFACE_STATUS_CONNECTED: + if ( blkif_state == BLKIF_STATE_CLOSED ) + { + printk(KERN_WARNING "Unexpected blkif-CONNECTED message" + " in state %d\n", blkif_state); + break; + } + + blkif_evtchn = status->evtchn; + blkif_irq = bind_evtchn_to_irq(blkif_evtchn); + (void)request_irq(blkif_irq, blkif_int, 0, "blkif", NULL); + + if ( recovery ) + { + int i; + + /* Shouldn't need the io_request_lock here - the device is + * plugged and the recovery flag prevents the interrupt handler + * changing anything. */ + + /* Reissue requests from the private block ring. */ + for ( i = 0; + resp_cons_rec < blk_ring_rec->req_prod; + resp_cons_rec++, i++ ) + { + blk_ring->ring[i].req + = blk_ring_rec->ring[MASK_BLK_IDX(resp_cons_rec)].req; + } + + /* Reset the private block ring to match the new ring. */ + memcpy(blk_ring_rec, blk_ring, sizeof(*blk_ring)); + resp_cons_rec = 0; + + /* blk_ring->req_prod will be set when we flush_requests().*/ + blk_ring_rec->req_prod = req_prod = i; + + wmb(); + + /* Switch off recovery mode, using a memory barrier to ensure that + * it's seen before we flush requests - we don't want to miss any + * interrupts. */ + recovery = 0; + wmb(); + + /* Kicks things back into life. */ + flush_requests(); + } + else + { + /* Probe for discs that are attached to the interface. */ + xlvbd_init(); + } + + blkif_state = BLKIF_STATE_CONNECTED; + + /* Kick pending requests. */ + spin_lock_irq(&io_request_lock); + kick_pending_request_queues(); + spin_unlock_irq(&io_request_lock); + + break; + + default: + printk(KERN_WARNING "Status change to unknown value %d\n", + status->status); + break; + } +} + + +static void blkif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id) +{ + switch ( msg->subtype ) + { + case CMSG_BLKIF_FE_INTERFACE_STATUS_CHANGED: + if ( msg->length != sizeof(blkif_fe_interface_status_changed_t) ) + goto parse_error; + blkif_status_change((blkif_fe_interface_status_changed_t *) + &msg->msg[0]); + break; +#if 0 + case CMSG_BLKIF_FE_VBD_STATUS_CHANGED: + update_tq.routine = update_vbds_task; + schedule_task(&update_tq); + break; +#endif + default: + goto parse_error; + } + + ctrl_if_send_response(msg); + return; + + parse_error: + msg->length = 0; + ctrl_if_send_response(msg); +} + + +int __init xlblk_init(void) +{ + ctrl_msg_t cmsg; + blkif_fe_driver_status_changed_t st; + + if ( (start_info.flags & SIF_INITDOMAIN) + || (start_info.flags & SIF_BLK_BE_DOMAIN) ) + return 0; + + printk(KERN_INFO "Initialising Xen virtual block device\n"); + + blk_ring_rec = (blkif_ring_t *)__get_free_page(GFP_KERNEL); + memset(blk_ring_rec, 0, sizeof(*blk_ring_rec)); + + (void)ctrl_if_register_receiver(CMSG_BLKIF_FE, blkif_ctrlif_rx, + CALLBACK_IN_BLOCKING_CONTEXT); + + /* Send a driver-UP notification to the domain controller. 
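+     * The handshake then proceeds as implemented in blkif_status_change() +     * above: the controller answers with a DISCONNECTED status change, we +     * allocate the shared ring and reply with INTERFACE_CONNECT, and a final +     * CONNECTED status change delivers the event channel we bind to an IRQ.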
*/ + cmsg.type = CMSG_BLKIF_FE; + cmsg.subtype = CMSG_BLKIF_FE_DRIVER_STATUS_CHANGED; + cmsg.length = sizeof(blkif_fe_driver_status_changed_t); + st.status = BLKIF_DRIVER_STATUS_UP; + memcpy(cmsg.msg, &st, sizeof(st)); + ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE); + + /* + * We should read 'nr_interfaces' from response message and wait + * for notifications before proceeding. For now we assume that we + * will be notified of exactly one interface. + */ + while ( blkif_state != BLKIF_STATE_CONNECTED ) + { + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(1); + } + + return 0; +} + + +static void __exit xlblk_cleanup(void) +{ + /* XXX FIXME */ + BUG(); +} + + +#ifdef MODULE +module_init(xlblk_init); +module_exit(xlblk_cleanup); +#endif + + +void blkdev_suspend(void) +{ + /* XXX FIXME */ + BUG(); +} + + +void blkdev_resume(void) +{ + /* XXX FIXME */ + BUG(); +} diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/frontend/vbd.c b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/frontend/vbd.c new file mode 100644 index 0000000000..12ce976cb5 --- /dev/null +++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/frontend/vbd.c @@ -0,0 +1,561 @@ +/****************************************************************************** + * arch/xen/drivers/blkif/frontend/vbd.c + * + * Xenolinux virtual block-device driver. + * + * Copyright (c) 2003-2004, Keir Fraser & Steve Hand + * Modifications by Mark A. Williamson are (c) Intel Research Cambridge + */ + +#include "common.h" +#include <linux/blk.h> + +/* + * For convenience we distinguish between ide, scsi and 'other' (i.e. + * potentially combinations of the two) in the naming scheme and in a few + * other places (like default readahead, etc). + */ +#define XLIDE_MAJOR_NAME "hd" +#define XLSCSI_MAJOR_NAME "sd" +#define XLVBD_MAJOR_NAME "xvd" + +#define XLIDE_DEVS_PER_MAJOR 2 +#define XLSCSI_DEVS_PER_MAJOR 16 +#define XLVBD_DEVS_PER_MAJOR 16 + +#define XLIDE_PARTN_SHIFT 6 /* amount to shift minor to get 'real' minor */ +#define XLIDE_MAX_PART (1 << XLIDE_PARTN_SHIFT) /* minors per ide vbd */ + +#define XLSCSI_PARTN_SHIFT 4 /* amount to shift minor to get 'real' minor */ +#define XLSCSI_MAX_PART (1 << XLSCSI_PARTN_SHIFT) /* minors per scsi vbd */ + +#define XLVBD_PARTN_SHIFT 4 /* amount to shift minor to get 'real' minor */ +#define XLVBD_MAX_PART (1 << XLVBD_PARTN_SHIFT) /* minors per 'other' vbd */ + +/* The below are for the generic drivers/block/ll_rw_block.c code. */ +static int xlide_blksize_size[256]; +static int xlide_hardsect_size[256]; +static int xlide_max_sectors[256]; +static int xlscsi_blksize_size[256]; +static int xlscsi_hardsect_size[256]; +static int xlscsi_max_sectors[256]; +static int xlvbd_blksize_size[256]; +static int xlvbd_hardsect_size[256]; +static int xlvbd_max_sectors[256]; + +/* Information about our VBDs. 
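+ * These are obtained with a BLKIF_OP_PROBE request (xlvbd_get_vbd_info() + * below) whose single segment names a whole page into which the backend + * writes an array of vdisk_t records; the response status carries the + * record count.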
*/ +#define MAX_VBDS 64 +static int nr_vbds; +static vdisk_t *vbd_info; + +static struct block_device_operations xlvbd_block_fops = +{ + open: blkif_open, + release: blkif_release, + ioctl: blkif_ioctl, + check_media_change: blkif_check, + revalidate: blkif_revalidate, +}; + +static int xlvbd_get_vbd_info(vdisk_t *disk_info) +{ + vdisk_t *buf = (vdisk_t *)__get_free_page(GFP_KERNEL); + blkif_request_t req; + blkif_response_t rsp; + int nr; + + memset(&req, 0, sizeof(req)); + req.operation = BLKIF_OP_PROBE; + req.nr_segments = 1; + req.frame_and_sects[0] = virt_to_machine(buf) | 7; + + blkif_control_send(&req, &rsp); + + if ( rsp.status <= 0 ) + { + printk(KERN_ALERT "Could not probe disks (%d)\n", rsp.status); + return -1; + } + + if ( (nr = rsp.status) > MAX_VBDS ) + nr = MAX_VBDS; + memcpy(disk_info, buf, nr * sizeof(vdisk_t)); + + return nr; +} + +/* + * xlvbd_init_device - initialise a VBD device + * @disk: a vdisk_t describing the VBD + * + * Takes a vdisk_t * that describes a VBD the domain has access to. + * Performs appropriate initialisation and registration of the device. + * + * Care needs to be taken when making re-entrant calls to ensure that + * corruption does not occur. Also, devices that are in use should not have + * their details updated. This is the caller's responsibility. + */ +static int xlvbd_init_device(vdisk_t *xd) +{ + int device = xd->device; + int major = MAJOR(device); + int minor = MINOR(device); + int is_ide = IDE_DISK_MAJOR(major); /* is this an ide device? */ + int is_scsi= SCSI_BLK_MAJOR(major); /* is this a scsi device? */ + char *major_name; + struct gendisk *gd; + struct block_device *bd; + xl_disk_t *disk; + int i, rc = 0, max_part, partno; + unsigned long capacity; + + unsigned char buf[64]; + + if ( (bd = bdget(device)) == NULL ) + return -1; + + /* + * Update of partition info, and check of usage count, is protected + * by the per-block-device semaphore. 
+ */ + down(&bd->bd_sem); + + if ( ((disk = xldev_to_xldisk(device)) != NULL) && (disk->usage != 0) ) + { + printk(KERN_ALERT "VBD update failed - in use [dev=%x]\n", device); + rc = -1; + goto out; + } + + if ( is_ide ) { + + major_name = XLIDE_MAJOR_NAME; + max_part = XLIDE_MAX_PART; + + } else if ( is_scsi ) { + + major_name = XLSCSI_MAJOR_NAME; + max_part = XLSCSI_MAX_PART; + + } else if (XD_VIRTUAL(xd->info)) { + + major_name = XLVBD_MAJOR_NAME; + max_part = XLVBD_MAX_PART; + + } else { + + /* SMH: hmm - probably a CCISS driver or sim; assume CCISS for now */ + printk(KERN_ALERT "Assuming device %02x:%02x is CCISS/SCSI\n", + major, minor); + is_scsi = 1; + major_name = "cciss"; + max_part = XLSCSI_MAX_PART; + + } + + partno = minor & (max_part - 1); + + if ( (gd = get_gendisk(device)) == NULL ) + { + rc = register_blkdev(major, major_name, &xlvbd_block_fops); + if ( rc < 0 ) + { + printk(KERN_ALERT "XL VBD: can't get major %d\n", major); + goto out; + } + + if ( is_ide ) + { + blksize_size[major] = xlide_blksize_size; + hardsect_size[major] = xlide_hardsect_size; + max_sectors[major] = xlide_max_sectors; + read_ahead[major] = 8; /* from drivers/ide/ide-probe.c */ + } + else if ( is_scsi ) + { + blksize_size[major] = xlscsi_blksize_size; + hardsect_size[major] = xlscsi_hardsect_size; + max_sectors[major] = xlscsi_max_sectors; + read_ahead[major] = 0; /* XXX 8; -- guessing */ + } + else + { + blksize_size[major] = xlvbd_blksize_size; + hardsect_size[major] = xlvbd_hardsect_size; + max_sectors[major] = xlvbd_max_sectors; + read_ahead[major] = 8; + } + + blk_init_queue(BLK_DEFAULT_QUEUE(major), do_blkif_request); + + /* + * Turn off barking 'headactive' mode. We dequeue buffer heads as + * soon as we pass them to the back-end driver. + */ + blk_queue_headactive(BLK_DEFAULT_QUEUE(major), 0); + + /* Construct an appropriate gendisk structure. */ + gd = kmalloc(sizeof(struct gendisk), GFP_KERNEL); + gd->major = major; + gd->major_name = major_name; + + gd->max_p = max_part; + if ( is_ide ) + { + gd->minor_shift = XLIDE_PARTN_SHIFT; + gd->nr_real = XLIDE_DEVS_PER_MAJOR; + } + else if ( is_scsi ) + { + gd->minor_shift = XLSCSI_PARTN_SHIFT; + gd->nr_real = XLSCSI_DEVS_PER_MAJOR; + } + else + { + gd->minor_shift = XLVBD_PARTN_SHIFT; + gd->nr_real = XLVBD_DEVS_PER_MAJOR; + } + + /* + ** The sizes[] and part[] arrays hold the sizes and other + ** information about every partition with this 'major' (i.e. + ** every disk sharing the 8 bit prefix * max partns per disk) + */ + gd->sizes = kmalloc(max_part*gd->nr_real*sizeof(int), GFP_KERNEL); + gd->part = kmalloc(max_part*gd->nr_real*sizeof(struct hd_struct), + GFP_KERNEL); + memset(gd->sizes, 0, max_part * gd->nr_real * sizeof(int)); + memset(gd->part, 0, max_part * gd->nr_real + * sizeof(struct hd_struct)); + + + gd->real_devices = kmalloc(gd->nr_real * sizeof(xl_disk_t), + GFP_KERNEL); + memset(gd->real_devices, 0, gd->nr_real * sizeof(xl_disk_t)); + + gd->next = NULL; + gd->fops = &xlvbd_block_fops; + + gd->de_arr = kmalloc(gd->nr_real * sizeof(*gd->de_arr), + GFP_KERNEL); + gd->flags = kmalloc(gd->nr_real * sizeof(*gd->flags), GFP_KERNEL); + + memset(gd->de_arr, 0, gd->nr_real * sizeof(*gd->de_arr)); + memset(gd->flags, 0, gd->nr_real * sizeof(*gd->flags)); + + add_gendisk(gd); + + blk_size[major] = gd->sizes; + } + + if ( XD_READONLY(xd->info) ) + set_device_ro(device, 1); + + gd->flags[minor >> gd->minor_shift] |= GENHD_FL_XEN; + + /* NB. Linux 2.4 only handles 32-bit sector offsets and capacities. 
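+     * With 512-byte sectors that caps a unit at 2^32 * 512 bytes (2TB); +     * anything larger would be truncated by the cast below.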
*/ + capacity = (unsigned long)xd->capacity; + + if ( partno != 0 ) + { + /* + * If this was previously set up as a real disc we will have set + * up partition-table information. Virtual partitions override + * 'real' partitions, and the two cannot coexist on a device. + */ + if ( !(gd->flags[minor >> gd->minor_shift] & GENHD_FL_VIRT_PARTNS) && + (gd->sizes[minor & ~(max_part-1)] != 0) ) + { + /* + * Any non-zero sub-partition entries must be cleaned out before + * installing 'virtual' partition entries. The two types cannot + * coexist, and virtual partitions are favoured. + */ + kdev_t dev = device & ~(max_part-1); + for ( i = max_part - 1; i > 0; i-- ) + { + invalidate_device(dev+i, 1); + gd->part[MINOR(dev+i)].start_sect = 0; + gd->part[MINOR(dev+i)].nr_sects = 0; + gd->sizes[MINOR(dev+i)] = 0; + } + printk(KERN_ALERT + "Virtual partitions found for /dev/%s - ignoring any " + "real partition information we may have found.\n", + disk_name(gd, MINOR(device), buf)); + } + + /* Need to skankily setup 'partition' information */ + gd->part[minor].start_sect = 0; + gd->part[minor].nr_sects = capacity; + gd->sizes[minor] = capacity; + + gd->flags[minor >> gd->minor_shift] |= GENHD_FL_VIRT_PARTNS; + } + else + { + gd->part[minor].nr_sects = capacity; + gd->sizes[minor] = capacity>>(BLOCK_SIZE_BITS-9); + + /* Some final fix-ups depending on the device type */ + switch ( XD_TYPE(xd->info) ) + { + case XD_TYPE_CDROM: + case XD_TYPE_FLOPPY: + case XD_TYPE_TAPE: + gd->flags[minor >> gd->minor_shift] |= GENHD_FL_REMOVABLE; + printk(KERN_ALERT + "Skipping partition check on %s /dev/%s\n", + XD_TYPE(xd->info)==XD_TYPE_CDROM ? "cdrom" : + (XD_TYPE(xd->info)==XD_TYPE_TAPE ? "tape" : + "floppy"), disk_name(gd, MINOR(device), buf)); + break; + + case XD_TYPE_DISK: + /* Only check partitions on real discs (not virtual!). */ + if ( gd->flags[minor>>gd->minor_shift] & GENHD_FL_VIRT_PARTNS ) + { + printk(KERN_ALERT + "Skipping partition check on virtual /dev/%s\n", + disk_name(gd, MINOR(device), buf)); + break; + } + register_disk(gd, device, gd->max_p, &xlvbd_block_fops, capacity); + break; + + default: + printk(KERN_ALERT "XenoLinux: unknown device type %d\n", + XD_TYPE(xd->info)); + break; + } + } + + out: + up(&bd->bd_sem); + bdput(bd); + return rc; +} + + +/* + * xlvbd_remove_device - remove a device node if possible + * @device: numeric device ID + * + * Updates the gendisk structure and invalidates devices. + * + * This is OK for now but in future, should perhaps consider where this should + * deallocate gendisks / unregister devices. + */ +static int xlvbd_remove_device(int device) +{ + int i, rc = 0, minor = MINOR(device); + struct gendisk *gd; + struct block_device *bd; + xl_disk_t *disk = NULL; + + if ( (bd = bdget(device)) == NULL ) + return -1; + + /* + * Update of partition info, and check of usage count, is protected + * by the per-block-device semaphore. + */ + down(&bd->bd_sem); + + if ( ((gd = get_gendisk(device)) == NULL) || + ((disk = xldev_to_xldisk(device)) == NULL) ) + BUG(); + + if ( disk->usage != 0 ) + { + printk(KERN_ALERT "VBD removal failed - in use [dev=%x]\n", device); + rc = -1; + goto out; + } + + if ( (minor & (gd->max_p-1)) != 0 ) + { + /* 1: The VBD is mapped to a partition rather than a whole unit. */ + invalidate_device(device, 1); + gd->part[minor].start_sect = 0; + gd->part[minor].nr_sects = 0; + gd->sizes[minor] = 0; + + /* Clear the consists-of-virtual-partitions flag if possible. 
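+         * The flag is cleared first and then re-set if any sibling minor +         * still has a non-zero size, so it ends up reflecting whether any +         * virtual partition remains on this unit.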
*/ + gd->flags[minor >> gd->minor_shift] &= ~GENHD_FL_VIRT_PARTNS; + for ( i = 1; i < gd->max_p; i++ ) + if ( gd->sizes[(minor & ~(gd->max_p-1)) + i] != 0 ) + gd->flags[minor >> gd->minor_shift] |= GENHD_FL_VIRT_PARTNS; + + /* + * If all virtual partitions are now gone, and a 'whole unit' VBD is + * present, then we can try to grok the unit's real partition table. + */ + if ( !(gd->flags[minor >> gd->minor_shift] & GENHD_FL_VIRT_PARTNS) && + (gd->sizes[minor & ~(gd->max_p-1)] != 0) && + !(gd->flags[minor >> gd->minor_shift] & GENHD_FL_REMOVABLE) ) + { + register_disk(gd, + device&~(gd->max_p-1), + gd->max_p, + &xlvbd_block_fops, + gd->part[minor&~(gd->max_p-1)].nr_sects); + } + } + else + { + /* + * 2: The VBD is mapped to an entire 'unit'. Clear all partitions. + * NB. The partition entries are only cleared if there are no VBDs + * mapped to individual partitions on this unit. + */ + i = gd->max_p - 1; /* Default: clear subpartitions as well. */ + if ( gd->flags[minor >> gd->minor_shift] & GENHD_FL_VIRT_PARTNS ) + i = 0; /* 'Virtual' mode: only clear the 'whole unit' entry. */ + while ( i >= 0 ) + { + invalidate_device(device+i, 1); + gd->part[minor+i].start_sect = 0; + gd->part[minor+i].nr_sects = 0; + gd->sizes[minor+i] = 0; + i--; + } + } + + out: + up(&bd->bd_sem); + bdput(bd); + return rc; +} + +/* + * xlvbd_update_vbds - reprobes the VBD status and performs updates to driver + * state. The VBDs need to be updated in this way when the domain is + * initialised and also each time we receive an XLBLK_UPDATE event. + */ +void xlvbd_update_vbds(void) +{ + int i, j, k, old_nr, new_nr; + vdisk_t *old_info, *new_info, *merged_info; + + old_info = vbd_info; + old_nr = nr_vbds; + + new_info = kmalloc(MAX_VBDS * sizeof(vdisk_t), GFP_KERNEL); + if ( unlikely((new_nr = xlvbd_get_vbd_info(new_info)) < 0) ) + { + kfree(new_info); + return; + } + + /* + * Final list maximum size is old list + new list. This occurs only when + * old list and new list do not overlap at all, and we cannot yet destroy + * VBDs in the old list because the usage counts are busy. + */ + merged_info = kmalloc((old_nr + new_nr) * sizeof(vdisk_t), GFP_KERNEL); + + /* @i tracks old list; @j tracks new list; @k tracks merged list. */ + i = j = k = 0; + + while ( (i < old_nr) && (j < new_nr) ) + { + if ( old_info[i].device < new_info[j].device ) + { + if ( xlvbd_remove_device(old_info[i].device) != 0 ) + memcpy(&merged_info[k++], &old_info[i], sizeof(vdisk_t)); + i++; + } + else if ( old_info[i].device > new_info[j].device ) + { + if ( xlvbd_init_device(&new_info[j]) == 0 ) + memcpy(&merged_info[k++], &new_info[j], sizeof(vdisk_t)); + j++; + } + else + { + if ( ((old_info[i].capacity == new_info[j].capacity) && + (old_info[i].info == new_info[j].info)) || + (xlvbd_remove_device(old_info[i].device) != 0) ) + memcpy(&merged_info[k++], &old_info[i], sizeof(vdisk_t)); + else if ( xlvbd_init_device(&new_info[j]) == 0 ) + memcpy(&merged_info[k++], &new_info[j], sizeof(vdisk_t)); + i++; j++; + } + } + + for ( ; i < old_nr; i++ ) + { + if ( xlvbd_remove_device(old_info[i].device) != 0 ) + memcpy(&merged_info[k++], &old_info[i], sizeof(vdisk_t)); + } + + for ( ; j < new_nr; j++ ) + { + if ( xlvbd_init_device(&new_info[j]) == 0 ) + memcpy(&merged_info[k++], &new_info[j], sizeof(vdisk_t)); + } + + vbd_info = merged_info; + nr_vbds = k; + + kfree(old_info); + kfree(new_info); +} + + +/* + * Set up all the linux device goop for the virtual block devices (vbd's) that + * we know about. Note that although from the backend driver's p.o.v. 
VBDs are + * addressed simply as an opaque 16-bit device number, the domain creation tools + * conventionally allocate these numbers to correspond to those used by 'real' + * linux -- this is just for convenience as it means e.g. that the same + * /etc/fstab can be used when booting with or without Xen. + */ +int xlvbd_init(void) +{ + int i; + + /* + * If compiled as a module, we don't support unloading yet. We therefore + * permanently increment the reference count to disallow it. + */ + SET_MODULE_OWNER(&xlvbd_block_fops); + MOD_INC_USE_COUNT; + + /* Initialize the global arrays. */ + for ( i = 0; i < 256; i++ ) + { + /* from the generic ide code (drivers/ide/ide-probe.c, etc) */ + xlide_blksize_size[i] = 1024; + xlide_hardsect_size[i] = 512; + xlide_max_sectors[i] = 128; /* 'hwif->rqsize' if we knew it */ + + /* from the generic scsi disk code (drivers/scsi/sd.c) */ + xlscsi_blksize_size[i] = 1024; /* XXX 512; */ + xlscsi_hardsect_size[i] = 512; + xlscsi_max_sectors[i] = 128*8; /* XXX 128; */ + + /* we don't really know what to set these to since it depends */ + xlvbd_blksize_size[i] = 512; + xlvbd_hardsect_size[i] = 512; + xlvbd_max_sectors[i] = 128; + } + + vbd_info = kmalloc(MAX_VBDS * sizeof(vdisk_t), GFP_KERNEL); + nr_vbds = xlvbd_get_vbd_info(vbd_info); + + if ( nr_vbds < 0 ) + { + kfree(vbd_info); + vbd_info = NULL; + nr_vbds = 0; + } + else + { + for ( i = 0; i < nr_vbds; i++ ) + xlvbd_init_device(&vbd_info[i]); + } + + return 0; +} diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/block/Makefile b/linux-2.4.26-xen-sparse/arch/xen/drivers/block/Makefile new file mode 100644 index 0000000000..35986ca54a --- /dev/null +++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/block/Makefile @@ -0,0 +1,3 @@ +O_TARGET := drv.o +obj-y := block.o vbd.o +include $(TOPDIR)/Rules.make diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/block/block.c b/linux-2.4.26-xen-sparse/arch/xen/drivers/block/block.c new file mode 100644 index 0000000000..43a6a23479 --- /dev/null +++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/block/block.c @@ -0,0 +1,625 @@ +/****************************************************************************** + * block.c + * + * Xenolinux virtual block-device driver. + * + * Copyright (c) 2003-2004, Keir Fraser & Steve Hand + * Modifications by Mark A. Williamson are (c) Intel Research Cambridge + */ + +#include "block.h" +#include <linux/blk.h> +#include <linux/cdrom.h> +#include <linux/tqueue.h> +#include <linux/sched.h> +#include <scsi/scsi.h> + +#include <linux/interrupt.h> + +typedef unsigned char byte; /* from linux/ide.h */ + +#define STATE_ACTIVE 0 +#define STATE_SUSPENDED 1 +#define STATE_CLOSED 2 +static unsigned int state = STATE_SUSPENDED; + +/* Dynamically-mapped IRQs. */ +static int xlblk_response_irq, xlblk_update_irq; + +static blk_ring_t *blk_ring; +static BLK_RING_IDX resp_cons; /* Response consumer for comms ring. */ +static BLK_RING_IDX req_prod; /* Private request producer. */ + +/* We plug the I/O ring if the driver is suspended or if the ring is full. */ +#define RING_PLUGGED (((req_prod - resp_cons) == BLK_RING_SIZE) || \ + (state != STATE_ACTIVE)) + + +/* + * Request queues with outstanding work, but ring is currently full. + * We need no special lock here, as we always access this with the + * io_request_lock held. We only need a small maximum list. 
+ */ +#define MAX_PENDING 8 +static request_queue_t *pending_queues[MAX_PENDING]; +static int nr_pending; + +static kdev_t sg_dev; +static int sg_operation = -1; +static unsigned long sg_next_sect; +#define DISABLE_SCATTERGATHER() (sg_operation = -1) + +static inline void signal_requests_to_xen(void) +{ + block_io_op_t op; + + DISABLE_SCATTERGATHER(); + blk_ring->req_prod = req_prod; + + op.cmd = BLOCK_IO_OP_SIGNAL; + HYPERVISOR_block_io_op(&op); + return; +} + + +/* + * xlblk_update_int/update_vbds_task - handle VBD update events from Xen + * + * Schedule a task for keventd to run, which will update the VBDs and perform + * the corresponding updates to our view of VBD state, so that XenoLinux will + * respond to changes / additions / deletions to the set of VBDs automatically. + */ +static struct tq_struct update_tq; +static void update_vbds_task(void *unused) +{ + xlvbd_update_vbds(); +} +static void xlblk_update_int(int irq, void *dev_id, struct pt_regs *ptregs) +{ + update_tq.routine = update_vbds_task; + schedule_task(&update_tq); +} + + +int xen_block_open(struct inode *inode, struct file *filep) +{ + short xldev = inode->i_rdev; + struct gendisk *gd = get_gendisk(xldev); + xl_disk_t *disk = xldev_to_xldisk(inode->i_rdev); + short minor = MINOR(xldev); + + if ( gd->part[minor].nr_sects == 0 ) + { + /* + * Device either doesn't exist, or has zero capacity; we use a few + * cheesy heuristics to return the relevant error code + */ + if ( (gd->sizes[minor >> gd->minor_shift] != 0) || + ((minor & (gd->max_p - 1)) != 0) ) + { + /* + * We have a real device, but no such partition, or we just have a + * partition number so guess this is the problem. + */ + return -ENXIO; /* no such device or address */ + } + else if ( gd->flags[minor >> gd->minor_shift] & GENHD_FL_REMOVABLE ) + { + /* This is a removable device => assume that media is missing. */ + return -ENOMEDIUM; /* media not present (this is a guess) */ + } + else + { + /* Just go for the general 'no such device' error. */ + return -ENODEV; /* no such device */ + } + } + + /* Update of usage count is protected by per-device semaphore. */ + disk->usage++; + + return 0; +} + + +int xen_block_release(struct inode *inode, struct file *filep) +{ + xl_disk_t *disk = xldev_to_xldisk(inode->i_rdev); + + /* + * When usage drops to zero it may allow more VBD updates to occur. + * Update of usage count is protected by a per-device semaphore. + */ + if ( --disk->usage == 0 ) + { + update_tq.routine = update_vbds_task; + schedule_task(&update_tq); + } + + return 0; +} + + +int xen_block_ioctl(struct inode *inode, struct file *filep, + unsigned command, unsigned long argument) +{ + kdev_t dev = inode->i_rdev; + struct hd_geometry *geo = (struct hd_geometry *)argument; + struct gendisk *gd; + struct hd_struct *part; + int i; + + /* NB. No need to check permissions. That is done for us. 
*/ + + DPRINTK_IOCTL("command: 0x%x, argument: 0x%lx, dev: 0x%04x\n", + command, (long) argument, dev); + + gd = get_gendisk(dev); + part = &gd->part[MINOR(dev)]; + + switch ( command ) + { + case BLKGETSIZE: + DPRINTK_IOCTL(" BLKGETSIZE: %x %lx\n", BLKGETSIZE, part->nr_sects); + return put_user(part->nr_sects, (unsigned long *) argument); + + case BLKGETSIZE64: + DPRINTK_IOCTL(" BLKGETSIZE64: %x %llx\n", BLKGETSIZE64, + (u64)part->nr_sects * 512); + return put_user((u64)part->nr_sects * 512, (u64 *) argument); + + case BLKRRPART: /* re-read partition table */ + DPRINTK_IOCTL(" BLKRRPART: %x\n", BLKRRPART); + return xen_block_revalidate(dev); + + case BLKSSZGET: + return hardsect_size[MAJOR(dev)][MINOR(dev)]; + + case BLKBSZGET: /* get block size */ + DPRINTK_IOCTL(" BLKBSZGET: %x\n", BLKBSZGET); + break; + + case BLKBSZSET: /* set block size */ + DPRINTK_IOCTL(" BLKBSZSET: %x\n", BLKBSZSET); + break; + + case BLKRASET: /* set read-ahead */ + DPRINTK_IOCTL(" BLKRASET: %x\n", BLKRASET); + break; + + case BLKRAGET: /* get read-ahead */ + DPRINTK_IOCTL(" BLKRAGET: %x\n", BLKRAGET); + break; + + case HDIO_GETGEO: + /* note: these values are complete garbage */ + DPRINTK_IOCTL(" HDIO_GETGEO: %x\n", HDIO_GETGEO); + if (!argument) return -EINVAL; + if (put_user(0x00, (unsigned long *) &geo->start)) return -EFAULT; + if (put_user(0xff, (byte *)&geo->heads)) return -EFAULT; + if (put_user(0x3f, (byte *)&geo->sectors)) return -EFAULT; + if (put_user(0x106, (unsigned short *)&geo->cylinders)) return -EFAULT; + return 0; + + case HDIO_GETGEO_BIG: + /* note: these values are complete garbage */ + DPRINTK_IOCTL(" HDIO_GETGEO_BIG: %x\n", HDIO_GETGEO_BIG); + if (!argument) return -EINVAL; + if (put_user(0x00, (unsigned long *) &geo->start)) return -EFAULT; + if (put_user(0xff, (byte *)&geo->heads)) return -EFAULT; + if (put_user(0x3f, (byte *)&geo->sectors)) return -EFAULT; + if (put_user(0x106, (unsigned int *) &geo->cylinders)) return -EFAULT; + return 0; + + case CDROMMULTISESSION: + DPRINTK("FIXME: support multisession CDs later\n"); + for ( i = 0; i < sizeof(struct cdrom_multisession); i++ ) + if ( put_user(0, (byte *)(argument + i)) ) return -EFAULT; + return 0; + + case SCSI_IOCTL_GET_BUS_NUMBER: + DPRINTK("FIXME: SCSI_IOCTL_GET_BUS_NUMBER ioctl in Xen blkdev"); + return -ENOSYS; + + default: + printk(KERN_ALERT "ioctl %08x not supported by Xen blkdev\n", command); + return -ENOSYS; + } + + return 0; +} + +/* check media change: should probably do something here in some cases :-) */ +int xen_block_check(kdev_t dev) +{ + DPRINTK("xen_block_check\n"); + return 0; +} + +int xen_block_revalidate(kdev_t dev) +{ + struct block_device *bd; + struct gendisk *gd; + xl_disk_t *disk; + unsigned long capacity; + int i, rc = 0; + + if ( (bd = bdget(dev)) == NULL ) + return -EINVAL; + + /* + * Update of partition info, and check of usage count, is protected + * by the per-block-device semaphore. + */ + down(&bd->bd_sem); + + if ( ((gd = get_gendisk(dev)) == NULL) || + ((disk = xldev_to_xldisk(dev)) == NULL) || + ((capacity = gd->part[MINOR(dev)].nr_sects) == 0) ) + { + rc = -EINVAL; + goto out; + } + + if ( disk->usage > 1 ) + { + rc = -EBUSY; + goto out; + } + + /* Only reread partition table if VBDs aren't mapped to partitions. 
*/ + if ( !(gd->flags[MINOR(dev) >> gd->minor_shift] & GENHD_FL_VIRT_PARTNS) ) + { + for ( i = gd->max_p - 1; i >= 0; i-- ) + { + invalidate_device(dev+i, 1); + gd->part[MINOR(dev+i)].start_sect = 0; + gd->part[MINOR(dev+i)].nr_sects = 0; + gd->sizes[MINOR(dev+i)] = 0; + } + + grok_partitions(gd, MINOR(dev)>>gd->minor_shift, gd->max_p, capacity); + } + + out: + up(&bd->bd_sem); + bdput(bd); + return rc; +} + + +/* + * hypervisor_request + * + * request block io + * + * id: for guest use only. + * operation: XEN_BLOCK_{READ,WRITE,PROBE,VBD*} + * buffer: buffer to read/write into. this should be a + * virtual address in the guest os. + */ +static int hypervisor_request(unsigned long id, + int operation, + char * buffer, + unsigned long sector_number, + unsigned short nr_sectors, + kdev_t device) +{ + unsigned long buffer_ma = phys_to_machine(virt_to_phys(buffer)); + struct gendisk *gd; + blk_ring_req_entry_t *req; + struct buffer_head *bh; + + if ( unlikely(nr_sectors >= (1<<9)) ) + BUG(); + if ( unlikely((buffer_ma & ((1<<9)-1)) != 0) ) + BUG(); + + if ( unlikely(state == STATE_CLOSED) ) + return 1; + + switch ( operation ) + { + + case XEN_BLOCK_READ: + case XEN_BLOCK_WRITE: + gd = get_gendisk(device); + + /* + * Update the sector_number we'll pass down as appropriate; note that + * we could sanity check that resulting sector will be in this + * partition, but this will happen in xen anyhow. + */ + sector_number += gd->part[MINOR(device)].start_sect; + + /* + * If this unit doesn't consist of virtual (i.e., Xen-specified) + * partitions then we clear the partn bits from the device number. + */ + if ( !(gd->flags[MINOR(device)>>gd->minor_shift] & + GENHD_FL_VIRT_PARTNS) ) + device &= ~(gd->max_p - 1); + + if ( (sg_operation == operation) && + (sg_dev == device) && + (sg_next_sect == sector_number) ) + { + req = &blk_ring->ring[MASK_BLK_IDX(req_prod-1)].req; + bh = (struct buffer_head *)id; + bh->b_reqnext = (struct buffer_head *)req->id; + req->id = id; + req->buffer_and_sects[req->nr_segments] = buffer_ma | nr_sectors; + if ( ++req->nr_segments < MAX_BLK_SEGS ) + sg_next_sect += nr_sectors; + else + DISABLE_SCATTERGATHER(); + return 0; + } + else if ( RING_PLUGGED ) + { + return 1; + } + else + { + sg_operation = operation; + sg_dev = device; + sg_next_sect = sector_number + nr_sectors; + } + break; + + default: + panic("unknown op %d\n", operation); + } + + /* Fill out a communications ring structure. 
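+     * For illustration: each buffer_and_sects[] entry packs the machine +     * address of the sector-aligned buffer with a sector count in the low +     * nine bits (hence the nr_sectors >= (1<<9) check above). E.g. eight +     * sectors at machine address 0xabc000 encode as 0xabc008.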
*/ + req = &blk_ring->ring[MASK_BLK_IDX(req_prod)].req; + req->id = id; + req->operation = operation; + req->sector_number = (xen_sector_t)sector_number; + req->device = device; + req->nr_segments = 1; + req->buffer_and_sects[0] = buffer_ma | nr_sectors; + req_prod++; + + return 0; +} + + +/* + * do_xlblk_request + * read a block; request is in a request queue + */ +void do_xlblk_request(request_queue_t *rq) +{ + struct request *req; + struct buffer_head *bh, *next_bh; + int rw, nsect, full, queued = 0; + + DPRINTK("xlblk.c::do_xlblk_request\n"); + + while ( !rq->plugged && !list_empty(&rq->queue_head)) + { + if ( (req = blkdev_entry_next_request(&rq->queue_head)) == NULL ) + goto out; + + DPRINTK("do_xlblk_request %p: cmd %i, sec %lx, (%li/%li) bh:%p\n", + req, req->cmd, req->sector, + req->current_nr_sectors, req->nr_sectors, req->bh); + + rw = req->cmd; + if ( rw == READA ) + rw = READ; + if ( unlikely((rw != READ) && (rw != WRITE)) ) + panic("XenoLinux Virtual Block Device: bad cmd: %d\n", rw); + + req->errors = 0; + + bh = req->bh; + while ( bh != NULL ) + { + next_bh = bh->b_reqnext; + bh->b_reqnext = NULL; + + full = hypervisor_request( + (unsigned long)bh, + (rw == READ) ? XEN_BLOCK_READ : XEN_BLOCK_WRITE, + bh->b_data, bh->b_rsector, bh->b_size>>9, bh->b_rdev); + + if ( full ) + { + bh->b_reqnext = next_bh; + pending_queues[nr_pending++] = rq; + if ( unlikely(nr_pending >= MAX_PENDING) ) + BUG(); + goto out; + } + + queued++; + + /* Dequeue the buffer head from the request. */ + nsect = bh->b_size >> 9; + bh = req->bh = next_bh; + + if ( bh != NULL ) + { + /* There's another buffer head to do. Update the request. */ + req->hard_sector += nsect; + req->hard_nr_sectors -= nsect; + req->sector = req->hard_sector; + req->nr_sectors = req->hard_nr_sectors; + req->current_nr_sectors = bh->b_size >> 9; + req->buffer = bh->b_data; + } + else + { + /* That was the last buffer head. Finalise the request. */ + if ( unlikely(end_that_request_first(req, 1, "XenBlk")) ) + BUG(); + blkdev_dequeue_request(req); + end_that_request_last(req); + } + } + } + + out: + if ( queued != 0 ) signal_requests_to_xen(); +} + + +static void kick_pending_request_queues(void) +{ + /* We kick pending request queues if the ring is reasonably empty. */ + if ( (nr_pending != 0) && + ((req_prod - resp_cons) < (BLK_RING_SIZE >> 1)) ) + { + /* Attempt to drain the queue, but bail if the ring becomes full. 
*/ + while ( (nr_pending != 0) && !RING_PLUGGED ) + do_xlblk_request(pending_queues[--nr_pending]); + } +} + + +static void xlblk_response_int(int irq, void *dev_id, struct pt_regs *ptregs) +{ + BLK_RING_IDX i; + unsigned long flags; + struct buffer_head *bh, *next_bh; + + if ( unlikely(state == STATE_CLOSED) ) + return; + + spin_lock_irqsave(&io_request_lock, flags); + + for ( i = resp_cons; i != blk_ring->resp_prod; i++ ) + { + blk_ring_resp_entry_t *bret = &blk_ring->ring[MASK_BLK_IDX(i)].resp; + switch ( bret->operation ) + { + case XEN_BLOCK_READ: + case XEN_BLOCK_WRITE: + if ( unlikely(bret->status != 0) ) + DPRINTK("Bad return from blkdev data request: %lx\n", + bret->status); + for ( bh = (struct buffer_head *)bret->id; + bh != NULL; + bh = next_bh ) + { + next_bh = bh->b_reqnext; + bh->b_reqnext = NULL; + bh->b_end_io(bh, !bret->status); + } + break; + + default: + BUG(); + } + } + + resp_cons = i; + + kick_pending_request_queues(); + + spin_unlock_irqrestore(&io_request_lock, flags); +} + + +static void reset_xlblk_interface(void) +{ + block_io_op_t op; + + op.cmd = BLOCK_IO_OP_RESET; + if ( HYPERVISOR_block_io_op(&op) != 0 ) + printk(KERN_ALERT "Possible blkdev trouble: couldn't reset ring\n"); + + op.cmd = BLOCK_IO_OP_RING_ADDRESS; + (void)HYPERVISOR_block_io_op(&op); + + set_fixmap(FIX_BLKRING_BASE, op.u.ring_mfn << PAGE_SHIFT); + blk_ring = (blk_ring_t *)fix_to_virt(FIX_BLKRING_BASE); + blk_ring->req_prod = blk_ring->resp_prod = resp_cons = req_prod = 0; + + wmb(); + state = STATE_ACTIVE; +} + + +int __init xlblk_init(void) +{ + int error; + + nr_pending = 0; + + reset_xlblk_interface(); + + xlblk_response_irq = bind_virq_to_irq(VIRQ_BLKDEV); + xlblk_update_irq = bind_virq_to_irq(VIRQ_VBD_UPD); + + error = request_irq(xlblk_response_irq, xlblk_response_int, + SA_SAMPLE_RANDOM, "blkdev", NULL); + if ( error ) + { + printk(KERN_ALERT "Could not allocate receive interrupt\n"); + goto fail; + } + + error = request_irq(xlblk_update_irq, xlblk_update_int, + 0, "blkdev", NULL); + + if ( error ) + { + printk(KERN_ALERT "Could not allocate block update interrupt\n"); + goto fail; + } + + (void)xlvbd_init(); + + return 0; + + fail: + return error; +} + + +static void __exit xlblk_cleanup(void) +{ + xlvbd_cleanup(); + free_irq(xlblk_response_irq, NULL); + free_irq(xlblk_update_irq, NULL); + unbind_virq_from_irq(VIRQ_BLKDEV); + unbind_virq_from_irq(VIRQ_VBD_UPD); +} + + +#ifdef MODULE +module_init(xlblk_init); +module_exit(xlblk_cleanup); +#endif + + +void blkdev_suspend(void) +{ + state = STATE_SUSPENDED; + wmb(); + + while ( resp_cons != blk_ring->req_prod ) + { + barrier(); + current->state = TASK_INTERRUPTIBLE; + schedule_timeout(1); + } + + wmb(); + state = STATE_CLOSED; + wmb(); + + clear_fixmap(FIX_BLKRING_BASE); +} + + +void blkdev_resume(void) +{ + reset_xlblk_interface(); + spin_lock_irq(&io_request_lock); + kick_pending_request_queues(); + spin_unlock_irq(&io_request_lock); +} diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/block/block.h b/linux-2.4.26-xen-sparse/arch/xen/drivers/block/block.h new file mode 100644 index 0000000000..e41e03970e --- /dev/null +++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/block/block.h @@ -0,0 +1,82 @@ +/****************************************************************************** + * block.h + * + * Shared definitions between all levels of XenoLinux Virtual block devices. 
+ */ + +#ifndef __XEN_DRIVERS_BLOCK_H__ +#define __XEN_DRIVERS_BLOCK_H__ + +#include <linux/config.h> +#include <linux/module.h> + +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/errno.h> + +#include <linux/fs.h> +#include <linux/hdreg.h> +#include <linux/blkdev.h> +#include <linux/major.h> + +#include <asm/hypervisor-ifs/hypervisor-if.h> +#include <asm/hypervisor-ifs/vbd.h> +#include <asm/io.h> +#include <asm/atomic.h> +#include <asm/uaccess.h> + +#if 0 +#define DPRINTK(_f, _a...) printk ( KERN_ALERT _f , ## _a ) +#else +#define DPRINTK(_f, _a...) ((void)0) +#endif + +#if 0 +#define DPRINTK_IOCTL(_f, _a...) printk ( KERN_ALERT _f , ## _a ) +#else +#define DPRINTK_IOCTL(_f, _a...) ((void)0) +#endif + +/* Private gendisk->flags[] values. */ +#define GENHD_FL_XEN 2 /* Is unit a Xen block device? */ +#define GENHD_FL_VIRT_PARTNS 4 /* Are unit partitions virtual? */ + +/* + * We have one of these per vbd, whether ide, scsi or 'other'. + * They hang in an array off the gendisk structure. We may end up putting + * all kinds of interesting stuff here :-) + */ +typedef struct xl_disk { + int usage; +} xl_disk_t; + +extern int xen_control_msg(int operation, char *buffer, int size); +extern int xen_block_open(struct inode *inode, struct file *filep); +extern int xen_block_release(struct inode *inode, struct file *filep); +extern int xen_block_ioctl(struct inode *inode, struct file *filep, + unsigned command, unsigned long argument); +extern int xen_block_check(kdev_t dev); +extern int xen_block_revalidate(kdev_t dev); +extern void do_xlblk_request (request_queue_t *rq); + +extern void xlvbd_update_vbds(void); + +static inline xl_disk_t *xldev_to_xldisk(kdev_t xldev) +{ + struct gendisk *gd = get_gendisk(xldev); + + if ( gd == NULL ) + return NULL; + + return (xl_disk_t *)gd->real_devices + + (MINOR(xldev) >> gd->minor_shift); +} + + +/* Virtual block-device subsystem. */ +extern int xlvbd_init(void); +extern void xlvbd_cleanup(void); + +#endif /* __XEN_DRIVERS_BLOCK_H__ */ diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/block/vbd.c b/linux-2.4.26-xen-sparse/arch/xen/drivers/block/vbd.c new file mode 100644 index 0000000000..e08b976c56 --- /dev/null +++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/block/vbd.c @@ -0,0 +1,561 @@ +/****************************************************************************** + * vbd.c + * + * Xenolinux virtual block-device driver (xvd). + * + * Copyright (c) 2003-2004, Keir Fraser & Steve Hand + * Modifications by Mark A. Williamson are (c) Intel Research Cambridge + */ + +#include "block.h" +#include <linux/blk.h> + +/* + * For convenience we distinguish between ide, scsi and 'other' (i.e. + * potentially combinations of the two) in the naming scheme and in a few + * other places (like default readahead, etc). 
+ */ +#define XLIDE_MAJOR_NAME "hd" +#define XLSCSI_MAJOR_NAME "sd" +#define XLVBD_MAJOR_NAME "xvd" + +#define XLIDE_DEVS_PER_MAJOR 2 +#define XLSCSI_DEVS_PER_MAJOR 16 +#define XLVBD_DEVS_PER_MAJOR 16 + +#define XLIDE_PARTN_SHIFT 6 /* amount to shift minor to get 'real' minor */ +#define XLIDE_MAX_PART (1 << XLIDE_PARTN_SHIFT) /* minors per ide vbd */ + +#define XLSCSI_PARTN_SHIFT 4 /* amount to shift minor to get 'real' minor */ +#define XLSCSI_MAX_PART (1 << XLSCSI_PARTN_SHIFT) /* minors per scsi vbd */ + +#define XLVBD_PARTN_SHIFT 4 /* amount to shift minor to get 'real' minor */ +#define XLVBD_MAX_PART (1 << XLVBD_PARTN_SHIFT) /* minors per 'other' vbd */ + +/* The below are for the generic drivers/block/ll_rw_block.c code. */ +static int xlide_blksize_size[256]; +static int xlide_hardsect_size[256]; +static int xlide_max_sectors[256]; +static int xlscsi_blksize_size[256]; +static int xlscsi_hardsect_size[256]; +static int xlscsi_max_sectors[256]; +static int xlvbd_blksize_size[256]; +static int xlvbd_hardsect_size[256]; +static int xlvbd_max_sectors[256]; + +/* Information from Xen about our VBDs. */ +#define MAX_VBDS 64 +static int nr_vbds; +static xen_disk_t *vbd_info; + +static struct block_device_operations xlvbd_block_fops = +{ + open: xen_block_open, + release: xen_block_release, + ioctl: xen_block_ioctl, + check_media_change: xen_block_check, + revalidate: xen_block_revalidate, +}; + +static int xlvbd_get_vbd_info(xen_disk_t *disk_info) +{ + int error; + block_io_op_t op; + + /* Probe for disk information. */ + memset(&op, 0, sizeof(op)); + op.cmd = BLOCK_IO_OP_VBD_PROBE; + op.u.probe_params.domain = 0; + op.u.probe_params.xdi.max = MAX_VBDS; + op.u.probe_params.xdi.disks = disk_info; + op.u.probe_params.xdi.count = 0; + + if ( (error = HYPERVISOR_block_io_op(&op)) != 0 ) + { + printk(KERN_ALERT "Could not probe disks (%d)\n", error); + return -1; + } + + return op.u.probe_params.xdi.count; +} + +/* + * xlvbd_init_device - initialise a VBD device + * @disk: a xen_disk_t describing the VBD + * + * Takes a xen_disk_t * that describes a VBD the domain has access to. + * Performs appropriate initialisation and registration of the device. + * + * Care needs to be taken when making re-entrant calls to ensure that + * corruption does not occur. Also, devices that are in use should not have + * their details updated. This is the caller's responsibility. + */ +static int xlvbd_init_device(xen_disk_t *xd) +{ + int device = xd->device; + int major = MAJOR(device); + int minor = MINOR(device); + int is_ide = IDE_DISK_MAJOR(major); /* is this an ide device? */ + int is_scsi= SCSI_BLK_MAJOR(major); /* is this a scsi device? */ + char *major_name; + struct gendisk *gd; + struct block_device *bd; + xl_disk_t *disk; + int i, rc = 0, max_part, partno; + unsigned long capacity; + + unsigned char buf[64]; + + if ( (bd = bdget(device)) == NULL ) + return -1; + + /* + * Update of partition info, and check of usage count, is protected + * by the per-block-device semaphore. 
+ */ + down(&bd->bd_sem); + + if ( ((disk = xldev_to_xldisk(device)) != NULL) && (disk->usage != 0) ) + { + printk(KERN_ALERT "VBD update failed - in use [dev=%x]\n", device); + rc = -1; + goto out; + } + + if ( is_ide ) { + + major_name = XLIDE_MAJOR_NAME; + max_part = XLIDE_MAX_PART; + + } else if ( is_scsi ) { + + major_name = XLSCSI_MAJOR_NAME; + max_part = XLSCSI_MAX_PART; + + } else if (XD_VIRTUAL(xd->info)) { + + major_name = XLVBD_MAJOR_NAME; + max_part = XLVBD_MAX_PART; + + } else { + + /* SMH: hmm - probably a CCISS driver or sim; assume CCISS for now */ + printk(KERN_ALERT "Assuming device %02x:%02x is CCISS/SCSI\n", + major, minor); + is_scsi = 1; + major_name = "cciss"; + max_part = XLSCSI_MAX_PART; + + } + + partno = minor & (max_part - 1); + + if ( (gd = get_gendisk(device)) == NULL ) + { + rc = register_blkdev(major, major_name, &xlvbd_block_fops); + if ( rc < 0 ) + { + printk(KERN_ALERT "XL VBD: can't get major %d\n", major); + goto out; + } + + if ( is_ide ) + { + blksize_size[major] = xlide_blksize_size; + hardsect_size[major] = xlide_hardsect_size; + max_sectors[major] = xlide_max_sectors; + read_ahead[major] = 8; /* from drivers/ide/ide-probe.c */ + } + else if ( is_scsi ) + { + blksize_size[major] = xlscsi_blksize_size; + hardsect_size[major] = xlscsi_hardsect_size; + max_sectors[major] = xlscsi_max_sectors; + read_ahead[major] = 0; /* XXX 8; -- guessing */ + } + else + { + blksize_size[major] = xlvbd_blksize_size; + hardsect_size[major] = xlvbd_hardsect_size; + max_sectors[major] = xlvbd_max_sectors; + read_ahead[major] = 8; + } + + blk_init_queue(BLK_DEFAULT_QUEUE(major), do_xlblk_request); + + /* + * Turn off barking 'headactive' mode. We dequeue buffer heads as + * soon as we pass them down to Xen. + */ + blk_queue_headactive(BLK_DEFAULT_QUEUE(major), 0); + + /* Construct an appropriate gendisk structure. */ + gd = kmalloc(sizeof(struct gendisk), GFP_KERNEL); + gd->major = major; + gd->major_name = major_name; + + gd->max_p = max_part; + if ( is_ide ) + { + gd->minor_shift = XLIDE_PARTN_SHIFT; + gd->nr_real = XLIDE_DEVS_PER_MAJOR; + } + else if ( is_scsi ) + { + gd->minor_shift = XLSCSI_PARTN_SHIFT; + gd->nr_real = XLSCSI_DEVS_PER_MAJOR; + } + else + { + gd->minor_shift = XLVBD_PARTN_SHIFT; + gd->nr_real = XLVBD_DEVS_PER_MAJOR; + } + + /* + ** The sizes[] and part[] arrays hold the sizes and other + ** information about every partition with this 'major' (i.e. + ** every disk sharing the 8 bit prefix * max partns per disk) + */ + gd->sizes = kmalloc(max_part*gd->nr_real*sizeof(int), GFP_KERNEL); + gd->part = kmalloc(max_part*gd->nr_real*sizeof(struct hd_struct), + GFP_KERNEL); + memset(gd->sizes, 0, max_part * gd->nr_real * sizeof(int)); + memset(gd->part, 0, max_part * gd->nr_real + * sizeof(struct hd_struct)); + + + gd->real_devices = kmalloc(gd->nr_real * sizeof(xl_disk_t), + GFP_KERNEL); + memset(gd->real_devices, 0, gd->nr_real * sizeof(xl_disk_t)); + + gd->next = NULL; + gd->fops = &xlvbd_block_fops; + + gd->de_arr = kmalloc(gd->nr_real * sizeof(*gd->de_arr), + GFP_KERNEL); + gd->flags = kmalloc(gd->nr_real * sizeof(*gd->flags), GFP_KERNEL); + + memset(gd->de_arr, 0, gd->nr_real * sizeof(*gd->de_arr)); + memset(gd->flags, 0, gd->nr_real * sizeof(*gd->flags)); + + add_gendisk(gd); + + blk_size[major] = gd->sizes; + } + + if ( XD_READONLY(xd->info) ) + set_device_ro(device, 1); + + gd->flags[minor >> gd->minor_shift] |= GENHD_FL_XEN; + + /* NB. Linux 2.4 only handles 32-bit sector offsets and capacities. 
*/ + capacity = (unsigned long)xd->capacity; + + if ( partno != 0 ) + { + /* + * If this was previously set up as a real disc we will have set + * up partition-table information. Virtual partitions override + * 'real' partitions, and the two cannot coexist on a device. + */ + if ( !(gd->flags[minor >> gd->minor_shift] & GENHD_FL_VIRT_PARTNS) && + (gd->sizes[minor & ~(max_part-1)] != 0) ) + { + /* + * Any non-zero sub-partition entries must be cleaned out before + * installing 'virtual' partition entries. The two types cannot + * coexist, and virtual partitions are favoured. + */ + kdev_t dev = device & ~(max_part-1); + for ( i = max_part - 1; i > 0; i-- ) + { + invalidate_device(dev+i, 1); + gd->part[MINOR(dev+i)].start_sect = 0; + gd->part[MINOR(dev+i)].nr_sects = 0; + gd->sizes[MINOR(dev+i)] = 0; + } + printk(KERN_ALERT + "Virtual partitions found for /dev/%s - ignoring any " + "real partition information we may have found.\n", + disk_name(gd, MINOR(device), buf)); + } + + /* Need to skankily setup 'partition' information */ + gd->part[minor].start_sect = 0; + gd->part[minor].nr_sects = capacity; + gd->sizes[minor] = capacity; + + gd->flags[minor >> gd->minor_shift] |= GENHD_FL_VIRT_PARTNS; + } + else + { + gd->part[minor].nr_sects = capacity; + gd->sizes[minor] = capacity>>(BLOCK_SIZE_BITS-9); + + /* Some final fix-ups depending on the device type */ + switch ( XD_TYPE(xd->info) ) + { + case XD_TYPE_CDROM: + case XD_TYPE_FLOPPY: + case XD_TYPE_TAPE: + gd->flags[minor >> gd->minor_shift] |= GENHD_FL_REMOVABLE; + printk(KERN_ALERT + "Skipping partition check on %s /dev/%s\n", + XD_TYPE(xd->info)==XD_TYPE_CDROM ? "cdrom" : + (XD_TYPE(xd->info)==XD_TYPE_TAPE ? "tape" : + "floppy"), disk_name(gd, MINOR(device), buf)); + break; + + case XD_TYPE_DISK: + /* Only check partitions on real discs (not virtual!). */ + if ( gd->flags[minor>>gd->minor_shift] & GENHD_FL_VIRT_PARTNS ) + { + printk(KERN_ALERT + "Skipping partition check on virtual /dev/%s\n", + disk_name(gd, MINOR(device), buf)); + break; + } + register_disk(gd, device, gd->max_p, &xlvbd_block_fops, capacity); + break; + + default: + printk(KERN_ALERT "XenoLinux: unknown device type %d\n", + XD_TYPE(xd->info)); + break; + } + } + + out: + up(&bd->bd_sem); + bdput(bd); + return rc; +} + + +/* + * xlvbd_remove_device - remove a device node if possible + * @device: numeric device ID + * + * Updates the gendisk structure and invalidates devices. + * + * This is OK for now but in future, should perhaps consider where this should + * deallocate gendisks / unregister devices. + */ +static int xlvbd_remove_device(int device) +{ + int i, rc = 0, minor = MINOR(device); + struct gendisk *gd; + struct block_device *bd; + xl_disk_t *disk = NULL; + + if ( (bd = bdget(device)) == NULL ) + return -1; + + /* + * Update of partition info, and check of usage count, is protected + * by the per-block-device semaphore. + */ + down(&bd->bd_sem); + + if ( ((gd = get_gendisk(device)) == NULL) || + ((disk = xldev_to_xldisk(device)) == NULL) ) + BUG(); + + if ( disk->usage != 0 ) + { + printk(KERN_ALERT "VBD removal failed - in use [dev=%x]\n", device); + rc = -1; + goto out; + } + + if ( (minor & (gd->max_p-1)) != 0 ) + { + /* 1: The VBD is mapped to a partition rather than a whole unit. */ + invalidate_device(device, 1); + gd->part[minor].start_sect = 0; + gd->part[minor].nr_sects = 0; + gd->sizes[minor] = 0; + + /* Clear the consists-of-virtual-partitions flag if possible. 
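 The flag
 * is per-unit while the sizes are per-minor, so it has to be recomputed by
 * scanning the unit's sub-partition entries. The scan in isolation
 * (illustrative only, not part of the patch):
 */

#include <stdio.h>

/* Nonzero if any sub-partition of the unit still has a populated entry. */
static int unit_has_virt_partns(const int *sizes, int unit_base, int max_part)
{
    int i;
    for ( i = 1; i < max_part; i++ )
        if ( sizes[unit_base + i] != 0 )
            return 1;
    return 0;
}

int main(void)
{
    int sizes[16] = { 0 };
    sizes[3] = 1024;                        /* one sub-partition survives */
    printf("virt partns? %d\n", unit_has_virt_partns(sizes, 0, 16));
    return 0;
}

/* Recompute the flag from the entries that remain: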
*/
+ gd->flags[minor >> gd->minor_shift] &= ~GENHD_FL_VIRT_PARTNS;
+ for ( i = 1; i < gd->max_p; i++ )
+ if ( gd->sizes[(minor & ~(gd->max_p-1)) + i] != 0 )
+ gd->flags[minor >> gd->minor_shift] |= GENHD_FL_VIRT_PARTNS;
+
+ /*
+ * If all virtual partitions are now gone, and a 'whole unit' VBD is
+ * present, then we can try to grok the unit's real partition table.
+ */
+ if ( !(gd->flags[minor >> gd->minor_shift] & GENHD_FL_VIRT_PARTNS) &&
+ (gd->sizes[minor & ~(gd->max_p-1)] != 0) &&
+ !(gd->flags[minor >> gd->minor_shift] & GENHD_FL_REMOVABLE) )
+ {
+ register_disk(gd,
+ device&~(gd->max_p-1),
+ gd->max_p,
+ &xlvbd_block_fops,
+ gd->part[minor&~(gd->max_p-1)].nr_sects);
+ }
+ }
+ else
+ {
+ /*
+ * 2: The VBD is mapped to an entire 'unit'. Clear all partitions.
+ * NB. The partition entries are only cleared if there are no VBDs
+ * mapped to individual partitions on this unit.
+ */
+ i = gd->max_p - 1; /* Default: clear subpartitions as well. */
+ if ( gd->flags[minor >> gd->minor_shift] & GENHD_FL_VIRT_PARTNS )
+ i = 0; /* 'Virtual' mode: only clear the 'whole unit' entry. */
+ while ( i >= 0 )
+ {
+ invalidate_device(device+i, 1);
+ gd->part[minor+i].start_sect = 0;
+ gd->part[minor+i].nr_sects = 0;
+ gd->sizes[minor+i] = 0;
+ i--;
+ }
+ }
+
+ out:
+ up(&bd->bd_sem);
+ bdput(bd);
+ return rc;
+}
+
+/*
+ * xlvbd_update_vbds - reprobes the VBD status and updates driver state
+ * accordingly. The VBDs need to be updated in this way when the domain is
+ * initialised and also each time we receive an XLBLK_UPDATE event.
+ */
+void xlvbd_update_vbds(void)
+{
+ int i, j, k, old_nr, new_nr;
+ xen_disk_t *old_info, *new_info, *merged_info;
+
+ old_info = vbd_info;
+ old_nr = nr_vbds;
+
+ new_info = kmalloc(MAX_VBDS * sizeof(xen_disk_t), GFP_KERNEL);
+ if ( unlikely((new_nr = xlvbd_get_vbd_info(new_info)) < 0) )
+ {
+ kfree(new_info);
+ return;
+ }
+
+ /*
+ * Final list maximum size is old list + new list. This occurs only when
+ * old list and new list do not overlap at all, and we cannot yet destroy
+ * VBDs in the old list because the usage counts are busy.
+ */
+ merged_info = kmalloc((old_nr + new_nr) * sizeof(xen_disk_t), GFP_KERNEL);
+
+ /* @i tracks old list; @j tracks new list; @k tracks merged list. */
+ i = j = k = 0;
+
+ while ( (i < old_nr) && (j < new_nr) )
+ {
+ if ( old_info[i].device < new_info[j].device )
+ {
+ if ( xlvbd_remove_device(old_info[i].device) != 0 )
+ memcpy(&merged_info[k++], &old_info[i], sizeof(xen_disk_t));
+ i++;
+ }
+ else if ( old_info[i].device > new_info[j].device )
+ {
+ if ( xlvbd_init_device(&new_info[j]) == 0 )
+ memcpy(&merged_info[k++], &new_info[j], sizeof(xen_disk_t));
+ j++;
+ }
+ else
+ {
+ if ( ((old_info[i].capacity == new_info[j].capacity) &&
+ (old_info[i].info == new_info[j].info)) ||
+ (xlvbd_remove_device(old_info[i].device) != 0) )
+ memcpy(&merged_info[k++], &old_info[i], sizeof(xen_disk_t));
+ else if ( xlvbd_init_device(&new_info[j]) == 0 )
+ memcpy(&merged_info[k++], &new_info[j], sizeof(xen_disk_t));
+ i++; j++;
+ }
+ }
+
+ for ( ; i < old_nr; i++ )
+ {
+ if ( xlvbd_remove_device(old_info[i].device) != 0 )
+ memcpy(&merged_info[k++], &old_info[i], sizeof(xen_disk_t));
+ }
+
+ for ( ; j < new_nr; j++ )
+ {
+ if ( xlvbd_init_device(&new_info[j]) == 0 )
+ memcpy(&merged_info[k++], &new_info[j], sizeof(xen_disk_t));
+ }
+
+ vbd_info = merged_info;
+ nr_vbds = k;
+
+ kfree(old_info);
+ kfree(new_info);
+}
+
+
+/*
+ * Set up all the linux device goop for the virtual block devices (vbd's) that
+ * xen tells us about.
Note that although from xen's pov VBDs are addressed
+ * simply as an opaque 16-bit device number, the domain creation tools
+ * conventionally allocate these numbers to correspond to those used by 'real'
+ * linux -- this is just for convenience as it means e.g. that the same
+ * /etc/fstab can be used when booting with or without xen.
+ */
+int __init xlvbd_init(void)
+{
+ int i;
+
+ /*
+ * If compiled as a module, we don't support unloading yet. We therefore
+ * permanently increment the reference count to disallow it.
+ */
+ SET_MODULE_OWNER(&xlvbd_block_fops);
+ MOD_INC_USE_COUNT;
+
+ /* Initialize the global arrays. */
+ for ( i = 0; i < 256; i++ )
+ {
+ /* from the generic ide code (drivers/ide/ide-probe.c, etc) */
+ xlide_blksize_size[i] = 1024;
+ xlide_hardsect_size[i] = 512;
+ xlide_max_sectors[i] = 128; /* 'hwif->rqsize' if we knew it */
+
+ /* from the generic scsi disk code (drivers/scsi/sd.c) */
+ xlscsi_blksize_size[i] = 1024; /* XXX 512; */
+ xlscsi_hardsect_size[i] = 512;
+ xlscsi_max_sectors[i] = 128*8; /* XXX 128; */
+
+ /* we don't really know what to set these to, since it depends on
+ * the underlying device */
+ xlvbd_blksize_size[i] = 512;
+ xlvbd_hardsect_size[i] = 512;
+ xlvbd_max_sectors[i] = 128;
+ }
+
+ vbd_info = kmalloc(MAX_VBDS * sizeof(xen_disk_t), GFP_KERNEL);
+ nr_vbds = xlvbd_get_vbd_info(vbd_info);
+
+ if ( nr_vbds < 0 )
+ {
+ kfree(vbd_info);
+ vbd_info = NULL;
+ nr_vbds = 0;
+ }
+ else
+ {
+ for ( i = 0; i < nr_vbds; i++ )
+ xlvbd_init_device(&vbd_info[i]);
+ }
+
+ return 0;
+}
+
+
+#ifdef MODULE
+module_init(xlvbd_init);
+#endif diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/console/Makefile b/linux-2.4.26-xen-sparse/arch/xen/drivers/console/Makefile new file mode 100644 index 0000000000..aaa546a8f3 --- /dev/null +++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/console/Makefile @@ -0,0 +1,3 @@
+O_TARGET := drv.o
+obj-$(CONFIG_XEN_CONSOLE) := console.o
+include $(TOPDIR)/Rules.make diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/console/console.c b/linux-2.4.26-xen-sparse/arch/xen/drivers/console/console.c new file mode 100644 index 0000000000..91b97e83d9 --- /dev/null +++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/console/console.c @@ -0,0 +1,625 @@
+/******************************************************************************
+ * console.c
+ *
+ * Virtual console driver.
+ *
+ * Copyright (c) 2002-2004, K A Fraser.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/interrupt.h>
+#include <linux/tty.h>
+#include <linux/tty_flip.h>
+#include <linux/serial.h>
+#include <linux/major.h>
+#include <linux/ptrace.h>
+#include <linux/ioport.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/console.h>
+#include <asm/evtchn.h>
+#include <asm/io.h>
+#include <asm/irq.h>
+#include <asm/uaccess.h>
+#include <asm/hypervisor.h>
+#include <asm/hypervisor-ifs/event_channel.h>
+#include <asm/ctrl_if.h>
+
+/*
+ * Modes:
+ * 'xencons=off' [XC_OFF]: Console is disabled.
+ * 'xencons=tty' [XC_TTY]: Console attached to '/dev/tty[0-9]+'.
+ * 'xencons=ttyS' [XC_SERIAL]: Console attached to '/dev/ttyS[0-9]+'.
+ * [XC_DEFAULT]: DOM0 -> XC_SERIAL ; all others -> XC_TTY.
+ *
+ * NB. In mode XC_TTY, we create dummy consoles for tty2-63. This suppresses
+ * warnings from standard distro startup scripts.
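 *
 * The mode is chosen by a kernel command-line parameter; the kernel calls
 * the __setup("xencons", ...) handler below with whatever text follows
 * "xencons=". The selection logic in isolation (illustrative only, not
 * part of the patch):
 */

#include <stdio.h>
#include <string.h>

enum demo_mode { MODE_OFF, MODE_DEFAULT, MODE_TTY, MODE_SERIAL };

static enum demo_mode parse_xencons(const char *str)
{
    if ( strcmp(str, "tty") == 0 )  return MODE_TTY;
    if ( strcmp(str, "ttyS") == 0 ) return MODE_SERIAL;
    if ( strcmp(str, "off") == 0 )  return MODE_OFF;
    return MODE_DEFAULT;           /* unrecognised text keeps the default */
}

int main(void)
{
    printf("xencons=ttyS -> mode %d\n", parse_xencons("ttyS"));
    return 0;
}

/* Mode state and the boot-time hook: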
+ */ +static enum { XC_OFF, XC_DEFAULT, XC_TTY, XC_SERIAL } xc_mode = XC_DEFAULT; + +static int __init xencons_setup(char *str) +{ + if ( !strcmp(str, "tty") ) + xc_mode = XC_TTY; + else if ( !strcmp(str, "ttyS") ) + xc_mode = XC_SERIAL; + else if ( !strcmp(str, "off") ) + xc_mode = XC_OFF; + return 1; +} +__setup("xencons", xencons_setup); + +/* The kernel and user-land drivers share a common transmit buffer. */ +#define WBUF_SIZE 4096 +#define WBUF_MASK(_i) ((_i)&(WBUF_SIZE-1)) +static char wbuf[WBUF_SIZE]; +static unsigned int wc, wp; /* write_cons, write_prod */ + +/* This lock protects accesses to the common transmit buffer. */ +static spinlock_t xencons_lock = SPIN_LOCK_UNLOCKED; + +/* Common transmit-kick routine. */ +static void __xencons_tx_flush(void); + +/* This task is used to defer sending console data until there is space. */ +static void xencons_tx_flush_task_routine(void *data); +static struct tq_struct xencons_tx_flush_task = { + routine: xencons_tx_flush_task_routine +}; + + +/******************** Kernel console driver ********************************/ + +static void kcons_write( + struct console *c, const char *s, unsigned int count) +{ + int i; + unsigned long flags; + + spin_lock_irqsave(&xencons_lock, flags); + + for ( i = 0; i < count; i++ ) + { + if ( (wp - wc) >= (WBUF_SIZE - 1) ) + break; + if ( (wbuf[WBUF_MASK(wp++)] = s[i]) == '\n' ) + wbuf[WBUF_MASK(wp++)] = '\r'; + } + + __xencons_tx_flush(); + + spin_unlock_irqrestore(&xencons_lock, flags); +} + +static void kcons_write_dom0( + struct console *c, const char *s, unsigned int count) +{ + int rc; + + while ( count > 0 ) + { + if ( (rc = HYPERVISOR_console_io(CONSOLEIO_write, count, s)) > 0 ) + { + count -= rc; + s += rc; + } + else + break; + } +} + +static kdev_t kcons_device(struct console *c) +{ + return MKDEV(TTY_MAJOR, (xc_mode == XC_SERIAL) ? 64 : 1); +} + +static struct console kcons_info = { + device: kcons_device, + flags: CON_PRINTBUFFER, + index: -1 +}; + +void xen_console_init(void) +{ + if ( start_info.flags & SIF_INITDOMAIN ) + { + if ( xc_mode == XC_DEFAULT ) + xc_mode = XC_SERIAL; + kcons_info.write = kcons_write_dom0; + } + else + { + if ( xc_mode == XC_DEFAULT ) + xc_mode = XC_TTY; + kcons_info.write = kcons_write; + } + + if ( xc_mode == XC_OFF ) + return; + + if ( xc_mode == XC_SERIAL ) + strcpy(kcons_info.name, "ttyS"); + else + strcpy(kcons_info.name, "tty"); + + register_console(&kcons_info); +} + + +/*** Useful function for console debugging -- goes straight to Xen. ***/ +asmlinkage int xprintk(const char *fmt, ...) +{ + va_list args; + int printk_len; + static char printk_buf[1024]; + + /* Emit the output into the temporary buffer */ + va_start(args, fmt); + printk_len = vsnprintf(printk_buf, sizeof(printk_buf), fmt, args); + va_end(args); + + /* Send the processed output directly to Xen. */ + kcons_write_dom0(NULL, printk_buf, printk_len); + + return 0; +} + +/*** Forcibly flush console data before dying. ***/ +void xencons_force_flush(void) +{ + ctrl_msg_t msg; + int sz; + + /* Emergency console is synchronous, so there's nothing to flush. */ + if ( start_info.flags & SIF_INITDOMAIN ) + return; + + /* + * We use dangerous control-interface functions that require a quiescent + * system and no interrupts. Try to ensure this with a global cli(). + */ + cli(); + + /* Spin until console data is flushed through to the domain controller. */ + while ( (wc != wp) && !ctrl_if_transmitter_empty() ) + { + /* Interrupts are disabled -- we must manually reap responses. 
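 Each message
 * carries the largest contiguous run available: the amount queued, clamped
 * to the payload size and to the ring wrap, exactly as computed just below.
 * The clamp in isolation (illustrative only, not part of the patch):
 */

#include <stdio.h>

static unsigned int contig_chunk(unsigned int cons, unsigned int prod,
                                 unsigned int ring_size, unsigned int max_msg)
{
    unsigned int sz = prod - cons;           /* indexes are free-running */
    if ( sz > max_msg )
        sz = max_msg;
    if ( sz > (ring_size - (cons & (ring_size - 1))) )
        sz = ring_size - (cons & (ring_size - 1));   /* stop at the wrap */
    return sz;
}

int main(void)
{
    /* 4096-byte ring, 60-byte payload, consumer 6 bytes short of the wrap */
    printf("chunk = %u\n", contig_chunk(4090, 4200, 4096, 60));
    return 0;
}

/* Reap queued responses by hand, then push out the next chunk: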
*/ + ctrl_if_discard_responses(); + + if ( (sz = wp - wc) == 0 ) + continue; + if ( sz > sizeof(msg.msg) ) + sz = sizeof(msg.msg); + if ( sz > (WBUF_SIZE - WBUF_MASK(wc)) ) + sz = WBUF_SIZE - WBUF_MASK(wc); + + msg.type = CMSG_CONSOLE; + msg.subtype = CMSG_CONSOLE_DATA; + msg.length = sz; + memcpy(msg.msg, &wbuf[WBUF_MASK(wc)], sz); + + if ( ctrl_if_send_message_noblock(&msg, NULL, 0) == 0 ) + wc += sz; + } +} + + +/******************** User-space console driver (/dev/console) ************/ + +static struct tty_driver xencons_driver; +static int xencons_refcount; +static struct tty_struct *xencons_table[MAX_NR_CONSOLES]; +static struct termios *xencons_termios[MAX_NR_CONSOLES]; +static struct termios *xencons_termios_locked[MAX_NR_CONSOLES]; +static struct tty_struct *xencons_tty; +static int xencons_priv_irq; +static char x_char; + +/* Non-privileged receive callback. */ +static void xencons_rx(ctrl_msg_t *msg, unsigned long id) +{ + int i; + unsigned long flags; + + spin_lock_irqsave(&xencons_lock, flags); + if ( xencons_tty != NULL ) + { + for ( i = 0; i < msg->length; i++ ) + tty_insert_flip_char(xencons_tty, msg->msg[i], 0); + tty_flip_buffer_push(xencons_tty); + } + spin_unlock_irqrestore(&xencons_lock, flags); + + msg->length = 0; + ctrl_if_send_response(msg); +} + +/* Privileged and non-privileged transmit worker. */ +static void __xencons_tx_flush(void) +{ + int sz, work_done = 0; + ctrl_msg_t msg; + + if ( start_info.flags & SIF_INITDOMAIN ) + { + if ( x_char ) + { + kcons_write_dom0(NULL, &x_char, 1); + x_char = 0; + work_done = 1; + } + + while ( wc != wp ) + { + sz = wp - wc; + if ( sz > (WBUF_SIZE - WBUF_MASK(wc)) ) + sz = WBUF_SIZE - WBUF_MASK(wc); + kcons_write_dom0(NULL, &wbuf[WBUF_MASK(wc)], sz); + wc += sz; + work_done = 1; + } + } + else + { + while ( x_char ) + { + msg.type = CMSG_CONSOLE; + msg.subtype = CMSG_CONSOLE_DATA; + msg.length = 1; + msg.msg[0] = x_char; + + if ( ctrl_if_send_message_noblock(&msg, NULL, 0) == 0 ) + x_char = 0; + else if ( ctrl_if_enqueue_space_callback(&xencons_tx_flush_task) ) + break; + + work_done = 1; + } + + while ( wc != wp ) + { + sz = wp - wc; + if ( sz > sizeof(msg.msg) ) + sz = sizeof(msg.msg); + if ( sz > (WBUF_SIZE - WBUF_MASK(wc)) ) + sz = WBUF_SIZE - WBUF_MASK(wc); + + msg.type = CMSG_CONSOLE; + msg.subtype = CMSG_CONSOLE_DATA; + msg.length = sz; + memcpy(msg.msg, &wbuf[WBUF_MASK(wc)], sz); + + if ( ctrl_if_send_message_noblock(&msg, NULL, 0) == 0 ) + wc += sz; + else if ( ctrl_if_enqueue_space_callback(&xencons_tx_flush_task) ) + break; + + work_done = 1; + } + } + + if ( work_done && (xencons_tty != NULL) ) + { + wake_up_interruptible(&xencons_tty->write_wait); + if ( (xencons_tty->flags & (1 << TTY_DO_WRITE_WAKEUP)) && + (xencons_tty->ldisc.write_wakeup != NULL) ) + (xencons_tty->ldisc.write_wakeup)(xencons_tty); + } +} + +/* Non-privileged transmit kicker. */ +static void xencons_tx_flush_task_routine(void *data) +{ + unsigned long flags; + spin_lock_irqsave(&xencons_lock, flags); + __xencons_tx_flush(); + spin_unlock_irqrestore(&xencons_lock, flags); +} + +/* Privileged receive callback and transmit kicker. */ +static void xencons_priv_interrupt(int irq, void *dev_id, struct pt_regs *regs) +{ + static char rbuf[16]; + int i, l; + unsigned long flags; + + spin_lock_irqsave(&xencons_lock, flags); + + if ( xencons_tty != NULL ) + { + /* Receive work. 
*/ + while ( (l = HYPERVISOR_console_io(CONSOLEIO_read, 16, rbuf)) > 0 ) + for ( i = 0; i < l; i++ ) + tty_insert_flip_char(xencons_tty, rbuf[i], 0); + if ( xencons_tty->flip.count != 0 ) + tty_flip_buffer_push(xencons_tty); + } + + /* Transmit work. */ + __xencons_tx_flush(); + + spin_unlock_irqrestore(&xencons_lock, flags); +} + +static int xencons_write_room(struct tty_struct *tty) +{ + return WBUF_SIZE - (wp - wc); +} + +static int xencons_chars_in_buffer(struct tty_struct *tty) +{ + return wp - wc; +} + +static void xencons_send_xchar(struct tty_struct *tty, char ch) +{ + unsigned long flags; + + if ( MINOR(tty->device) != xencons_driver.minor_start ) + return; + + spin_lock_irqsave(&xencons_lock, flags); + x_char = ch; + __xencons_tx_flush(); + spin_unlock_irqrestore(&xencons_lock, flags); +} + +static void xencons_throttle(struct tty_struct *tty) +{ + if ( MINOR(tty->device) != xencons_driver.minor_start ) + return; + + if ( I_IXOFF(tty) ) + xencons_send_xchar(tty, STOP_CHAR(tty)); +} + +static void xencons_unthrottle(struct tty_struct *tty) +{ + if ( MINOR(tty->device) != xencons_driver.minor_start ) + return; + + if ( I_IXOFF(tty) ) + { + if ( x_char != 0 ) + x_char = 0; + else + xencons_send_xchar(tty, START_CHAR(tty)); + } +} + +static void xencons_flush_buffer(struct tty_struct *tty) +{ + unsigned long flags; + + if ( MINOR(tty->device) != xencons_driver.minor_start ) + return; + + spin_lock_irqsave(&xencons_lock, flags); + wc = wp = 0; + spin_unlock_irqrestore(&xencons_lock, flags); +} + +static inline int __xencons_put_char(int ch) +{ + char _ch = (char)ch; + if ( (wp - wc) == WBUF_SIZE ) + return 0; + wbuf[WBUF_MASK(wp++)] = _ch; + return 1; +} + +static int xencons_write(struct tty_struct *tty, int from_user, + const u_char * buf, int count) +{ + int i; + unsigned long flags; + + if ( from_user && verify_area(VERIFY_READ, buf, count) ) + return -EINVAL; + + if ( MINOR(tty->device) != xencons_driver.minor_start ) + return count; + + spin_lock_irqsave(&xencons_lock, flags); + + for ( i = 0; i < count; i++ ) + { + char ch; + if ( from_user ) + __get_user(ch, buf + i); + else + ch = buf[i]; + if ( !__xencons_put_char(ch) ) + break; + } + + if ( i != 0 ) + __xencons_tx_flush(); + + spin_unlock_irqrestore(&xencons_lock, flags); + + return i; +} + +static void xencons_put_char(struct tty_struct *tty, u_char ch) +{ + unsigned long flags; + + if ( MINOR(tty->device) != xencons_driver.minor_start ) + return; + + spin_lock_irqsave(&xencons_lock, flags); + (void)__xencons_put_char(ch); + spin_unlock_irqrestore(&xencons_lock, flags); +} + +static void xencons_flush_chars(struct tty_struct *tty) +{ + unsigned long flags; + + if ( MINOR(tty->device) != xencons_driver.minor_start ) + return; + + spin_lock_irqsave(&xencons_lock, flags); + __xencons_tx_flush(); + spin_unlock_irqrestore(&xencons_lock, flags); +} + +static void xencons_wait_until_sent(struct tty_struct *tty, int timeout) +{ + unsigned long orig_jiffies = jiffies; + + if ( MINOR(tty->device) != xencons_driver.minor_start ) + return; + + while ( tty->driver.chars_in_buffer(tty) ) + { + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(1); + if ( signal_pending(current) ) + break; + if ( (timeout != 0) && time_after(jiffies, orig_jiffies + timeout) ) + break; + } + + set_current_state(TASK_RUNNING); +} + +static int xencons_open(struct tty_struct *tty, struct file *filp) +{ + int line; + unsigned long flags; + + if ( MINOR(tty->device) != xencons_driver.minor_start ) + return 0; + + MOD_INC_USE_COUNT; + line = 
MINOR(tty->device) - tty->driver.minor_start; + if ( line != 0 ) + { + MOD_DEC_USE_COUNT; + return -ENODEV; + } + + spin_lock_irqsave(&xencons_lock, flags); + tty->driver_data = NULL; + if ( xencons_tty == NULL ) + xencons_tty = tty; + __xencons_tx_flush(); + spin_unlock_irqrestore(&xencons_lock, flags); + + return 0; +} + +static void xencons_close(struct tty_struct *tty, struct file *filp) +{ + unsigned long flags; + + if ( MINOR(tty->device) != xencons_driver.minor_start ) + return; + + if ( tty->count == 1 ) + { + tty->closing = 1; + tty_wait_until_sent(tty, 0); + if ( tty->driver.flush_buffer != NULL ) + tty->driver.flush_buffer(tty); + if ( tty->ldisc.flush_buffer != NULL ) + tty->ldisc.flush_buffer(tty); + tty->closing = 0; + spin_lock_irqsave(&xencons_lock, flags); + xencons_tty = NULL; + spin_unlock_irqrestore(&xencons_lock, flags); + } + + MOD_DEC_USE_COUNT; +} + +static int __init xencons_init(void) +{ + memset(&xencons_driver, 0, sizeof(struct tty_driver)); + xencons_driver.magic = TTY_DRIVER_MAGIC; + xencons_driver.major = TTY_MAJOR; + xencons_driver.type = TTY_DRIVER_TYPE_SERIAL; + xencons_driver.subtype = SERIAL_TYPE_NORMAL; + xencons_driver.init_termios = tty_std_termios; + xencons_driver.flags = + TTY_DRIVER_REAL_RAW | TTY_DRIVER_RESET_TERMIOS | TTY_DRIVER_NO_DEVFS; + xencons_driver.refcount = &xencons_refcount; + xencons_driver.table = xencons_table; + xencons_driver.termios = xencons_termios; + xencons_driver.termios_locked = xencons_termios_locked; + + if ( xc_mode == XC_OFF ) + return 0; + + if ( xc_mode == XC_SERIAL ) + { + xencons_driver.name = "ttyS"; + xencons_driver.minor_start = 64; + xencons_driver.num = 1; + } + else + { + xencons_driver.name = "tty"; + xencons_driver.minor_start = 1; + xencons_driver.num = MAX_NR_CONSOLES; + } + + xencons_driver.open = xencons_open; + xencons_driver.close = xencons_close; + xencons_driver.write = xencons_write; + xencons_driver.write_room = xencons_write_room; + xencons_driver.put_char = xencons_put_char; + xencons_driver.flush_chars = xencons_flush_chars; + xencons_driver.chars_in_buffer = xencons_chars_in_buffer; + xencons_driver.send_xchar = xencons_send_xchar; + xencons_driver.flush_buffer = xencons_flush_buffer; + xencons_driver.throttle = xencons_throttle; + xencons_driver.unthrottle = xencons_unthrottle; + xencons_driver.wait_until_sent = xencons_wait_until_sent; + + if ( tty_register_driver(&xencons_driver) ) + panic("Couldn't register Xen virtual console driver\n"); + + if ( start_info.flags & SIF_INITDOMAIN ) + { + xencons_priv_irq = bind_virq_to_irq(VIRQ_CONSOLE); + (void)request_irq(xencons_priv_irq, + xencons_priv_interrupt, 0, "console", NULL); + } + else + { + (void)ctrl_if_register_receiver(CMSG_CONSOLE, xencons_rx, 0); + } + + printk("Xen virtual console successfully installed\n"); + + return 0; +} + +static void __exit xencons_fini(void) +{ + int ret; + + if ( (ret = tty_unregister_driver(&xencons_driver)) != 0 ) + printk(KERN_ERR "Unable to unregister Xen console driver: %d\n", ret); + + if ( start_info.flags & SIF_INITDOMAIN ) + { + free_irq(xencons_priv_irq, NULL); + unbind_virq_from_irq(VIRQ_CONSOLE); + } + else + { + ctrl_if_unregister_receiver(CMSG_CONSOLE, xencons_rx); + } +} + +module_init(xencons_init); +module_exit(xencons_fini); diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/dom0/Makefile b/linux-2.4.26-xen-sparse/arch/xen/drivers/dom0/Makefile new file mode 100644 index 0000000000..3e2e17bd23 --- /dev/null +++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/dom0/Makefile @@ -0,0 +1,3 @@ 
+O_TARGET := drv.o +obj-y := core.o vfr.o +include $(TOPDIR)/Rules.make diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/dom0/core.c b/linux-2.4.26-xen-sparse/arch/xen/drivers/dom0/core.c new file mode 100644 index 0000000000..5412a28572 --- /dev/null +++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/dom0/core.c @@ -0,0 +1,233 @@ +/****************************************************************************** + * core.c + * + * Interface to privileged domain-0 commands. + * + * Copyright (c) 2002-2004, K A Fraser, B Dragovic + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/mm.h> +#include <linux/mman.h> +#include <linux/swap.h> +#include <linux/smp_lock.h> +#include <linux/swapctl.h> +#include <linux/iobuf.h> +#include <linux/highmem.h> +#include <linux/pagemap.h> +#include <linux/seq_file.h> + +#include <asm/pgalloc.h> +#include <asm/pgtable.h> +#include <asm/uaccess.h> +#include <asm/tlb.h> +#include <asm/proc_cmd.h> +#include <asm/hypervisor-ifs/dom0_ops.h> +#include <asm/xen_proc.h> + +static struct proc_dir_entry *privcmd_intf; + +static int privcmd_ioctl(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long data) +{ + int ret = -ENOSYS; + + switch ( cmd ) + { + case IOCTL_PRIVCMD_HYPERCALL: + { + privcmd_hypercall_t hypercall; + + if ( copy_from_user(&hypercall, (void *)data, sizeof(hypercall)) ) + return -EFAULT; + + __asm__ __volatile__ ( + "pushl %%ebx; pushl %%ecx; pushl %%edx; pushl %%esi; pushl %%edi; " + "movl 4(%%eax),%%ebx ;" + "movl 8(%%eax),%%ecx ;" + "movl 12(%%eax),%%edx ;" + "movl 16(%%eax),%%esi ;" + "movl 20(%%eax),%%edi ;" + "movl (%%eax),%%eax ;" + TRAP_INSTR "; " + "popl %%edi; popl %%esi; popl %%edx; popl %%ecx; popl %%ebx" + : "=a" (ret) : "0" (&hypercall) : "memory" ); + + } + break; + + case IOCTL_PRIVCMD_INITDOMAIN_EVTCHN: + { + extern int initdom_ctrlif_domcontroller_port; + ret = initdom_ctrlif_domcontroller_port; + } + break; + + + case IOCTL_PRIVCMD_MMAP: + { +#define PRIVCMD_MMAP_SZ 32 + privcmd_mmap_t mmapcmd; + privcmd_mmap_entry_t msg[PRIVCMD_MMAP_SZ], *p; + int i, rc; + + if ( copy_from_user(&mmapcmd, (void *)data, sizeof(mmapcmd)) ) + return -EFAULT; + + p = mmapcmd.entry; + + for (i=0; i<mmapcmd.num; i+=PRIVCMD_MMAP_SZ, p+=PRIVCMD_MMAP_SZ) + { + int j, n = ((mmapcmd.num-i)>PRIVCMD_MMAP_SZ)? 
+ PRIVCMD_MMAP_SZ:(mmapcmd.num-i); + if ( copy_from_user(&msg, p, n*sizeof(privcmd_mmap_entry_t)) ) + return -EFAULT; + + for ( j = 0; j < n; j++ ) + { + struct vm_area_struct *vma = + find_vma( current->mm, msg[j].va ); + + if ( !vma ) + return -EINVAL; + + if ( msg[j].va > PAGE_OFFSET ) + return -EINVAL; + + if ( (msg[j].va + (msg[j].npages<<PAGE_SHIFT)) > vma->vm_end ) + return -EINVAL; + + if ( (rc = direct_remap_area_pages(vma->vm_mm, + msg[j].va&PAGE_MASK, + msg[j].mfn<<PAGE_SHIFT, + msg[j].npages<<PAGE_SHIFT, + vma->vm_page_prot, + mmapcmd.dom)) < 0 ) + return rc; + } + } + ret = 0; + } + break; + + case IOCTL_PRIVCMD_MMAPBATCH: + { +#define MAX_DIRECTMAP_MMU_QUEUE 130 + mmu_update_t u[MAX_DIRECTMAP_MMU_QUEUE], *w, *v; + privcmd_mmapbatch_t m; + struct vm_area_struct *vma = NULL; + unsigned long *p, addr; + unsigned long mfn; + int i; + + if ( copy_from_user(&m, (void *)data, sizeof(m)) ) + { ret = -EFAULT; goto batch_err; } + + vma = find_vma( current->mm, m.addr ); + + if ( !vma ) + { ret = -EINVAL; goto batch_err; } + + if ( m.addr > PAGE_OFFSET ) + { ret = -EFAULT; goto batch_err; } + + if ( (m.addr + (m.num<<PAGE_SHIFT)) > vma->vm_end ) + { ret = -EFAULT; goto batch_err; } + + if ( m.dom != 0 ) + { + u[0].val = (unsigned long)(m.dom<<16) & ~0xFFFFUL; + u[0].ptr = (unsigned long)(m.dom<< 0) & ~0xFFFFUL; + u[0].ptr |= MMU_EXTENDED_COMMAND; + u[0].val |= MMUEXT_SET_SUBJECTDOM; + v = w = &u[1]; + } + else + { + v = w = &u[0]; + } + + p = m.arr; + addr = m.addr; + for ( i = 0; i < m.num; i++, addr += PAGE_SIZE, p++ ) + { + if ( get_user(mfn, p) ) return -EFAULT; + + v->val = (mfn << PAGE_SHIFT) | pgprot_val(vma->vm_page_prot) | + _PAGE_IO; + + __direct_remap_area_pages(vma->vm_mm, + addr, + PAGE_SIZE, + v); + + if ( unlikely(HYPERVISOR_mmu_update(u, v - u + 1, NULL) < 0) ) + put_user( 0xF0000000 | mfn, p ); + + v = w; + } + ret = 0; + break; + + batch_err: + printk("batch_err ret=%d vma=%p addr=%lx num=%d arr=%p %lx-%lx\n", + ret, vma, m.addr, m.num, m.arr, vma->vm_start, vma->vm_end); + break; + } + break; + + default: + ret = -EINVAL; + break; + } + return ret; +} + +static int privcmd_mmap(struct file * file, struct vm_area_struct * vma) +{ + /* DONTCOPY is essential for Xen as copy_page_range is broken. */ + vma->vm_flags |= VM_RESERVED | VM_IO | VM_DONTCOPY; + + return 0; +} + +static struct file_operations privcmd_file_ops = { + ioctl : privcmd_ioctl, + mmap: privcmd_mmap +}; + + +static int __init init_module(void) +{ + if ( !(start_info.flags & SIF_PRIVILEGED) ) + return 0; + + privcmd_intf = create_xen_proc_entry("privcmd", 0400); + if ( privcmd_intf != NULL ) + { + privcmd_intf->owner = THIS_MODULE; + privcmd_intf->nlink = 1; + privcmd_intf->proc_fops = &privcmd_file_ops; + } + + return 0; +} + + +static void __exit cleanup_module(void) +{ + if ( privcmd_intf == NULL ) return; + remove_xen_proc_entry("privcmd"); + privcmd_intf = NULL; +} + + +module_init(init_module); +module_exit(cleanup_module); +# diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/dom0/vfr.c b/linux-2.4.26-xen-sparse/arch/xen/drivers/dom0/vfr.c new file mode 100644 index 0000000000..9d8ca0a32d --- /dev/null +++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/dom0/vfr.c @@ -0,0 +1,343 @@ +/****************************************************************************** + * vfr.c + * + * Interface to the virtual firewall/router. 
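 *
 * Rules are plain text written to the proc node created below with
 * create_xen_proc_entry("vfr", 0600), conventionally visible as
 * /proc/xen/vfr (the exact path is an assumption). A hypothetical
 * userspace writer matching the grammar parsed by vfr_write_proc():
 */

#include <stdio.h>

int main(void)
{
    FILE *f = fopen("/proc/xen/vfr", "w");

    if ( f == NULL )
        return 1;
    /* COMMAND ACTION field=val ..., as understood by the parser below. */
    fprintf(f, "ADD ACCEPT srcaddr=10.0.0.1 dstport=80 proto=tcp\n");
    fclose(f);
    return 0;
}

/*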
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <asm/xen_proc.h>
+#include <asm/hypervisor-ifs/network.h>
+
+static struct proc_dir_entry *proc_vfr;
+
+static unsigned char readbuf[1024];
+
+/* Helpers, implemented at the bottom. */
+u32 getipaddr(const char *buff, unsigned int len);
+u16 antous(const char *buff, int len);
+u64 antoull(const char *buff, int len);
+int anton(const char *buff, int len);
+
+static int vfr_read_proc(char *page, char **start, off_t off,
+ int count, int *eof, void *data)
+{
+ strcpy(page, readbuf);
+ *readbuf = '\0';
+ *eof = 1;
+ *start = page;
+ return strlen(page);
+}
+
+/* The format for the vfr interface is as follows:
+ *
+ * COMMAND ACTION <field>=<val> [<field>=<val> [...]]
+ *
+ * where:
+ *
+ * COMMAND = { ADD | DELETE | PRINT }
+ * (PRINT takes no action and no field=val pairs.)
+ *
+ * ACTION = { ACCEPT | COUNT }
+ *
+ * field=val pairs are as follows:
+ *
+ * field = { srcaddr | dstaddr }
+ * val is a dot-separated, numeric IP address.
+ *
+ * field = { srcport | dstport }
+ * val is a (16-bit) unsigned int
+ *
+ * field = { proto }
+ * val = { IP | TCP | UDP | ARP }
+ *
+ */
+
+#define isspace(_x) ( ((_x)==' ') || ((_x)=='\t') || ((_x)=='\v') || \
+ ((_x)=='\f') || ((_x)=='\r') || ((_x)=='\n') )
+
+static int vfr_write_proc(struct file *file, const char *buffer,
+ u_long count, void *data)
+{
+ network_op_t op;
+ int ret, len;
+ int ts, te, tl; // token start, end, and length
+ int fs, fe, fl; // field start, end, and length
+
+ len = count;
+ ts = te = 0;
+
+ memset(&op, 0, sizeof(network_op_t));
+
+ // get the command:
+ while ( count && isspace(buffer[ts]) ) { ts++; count--; } // skip spaces.
+ te = ts;
+ while ( count && !isspace(buffer[te]) ) { te++; count--; } // command end
+ if ( te <= ts ) goto bad;
+ tl = te - ts;
+
+ if ( strncmp(&buffer[ts], "ADD", tl) == 0 )
+ {
+ op.cmd = NETWORK_OP_ADDRULE;
+ }
+ else if ( strncmp(&buffer[ts], "DELETE", tl) == 0 )
+ {
+ op.cmd = NETWORK_OP_DELETERULE;
+ }
+ else if ( strncmp(&buffer[ts], "PRINT", tl) == 0 )
+ {
+ op.cmd = NETWORK_OP_GETRULELIST;
+ goto doneparsing;
+ }
+
+ ts = te;
+
+ // get the action
+ while ( count && (buffer[ts] == ' ') ) { ts++; count--; } // skip spaces.
+ te = ts;
+ while ( count && (buffer[te] != ' ') ) { te++; count--; } // action end
+ if ( te <= ts ) goto bad;
+ tl = te - ts;
+
+ if ( strncmp(&buffer[ts], "ACCEPT", tl) == 0 )
+ {
+ op.u.net_rule.action = NETWORK_ACTION_ACCEPT;
+ goto keyval;
+ }
+ if ( strncmp(&buffer[ts], "COUNT", tl) == 0 )
+ {
+ op.u.net_rule.action = NETWORK_ACTION_COUNT;
+ goto keyval;
+ }
+
+ // default case: unrecognised action.
+ return (len);
+
+
+ // get the key=val pairs.
+ keyval:
+ while (count)
+ {
+ //get field
+ ts = te; while ( count && isspace(buffer[ts]) ) { ts++; count--; }
+ te = ts;
+ while ( count && !isspace(buffer[te]) && (buffer[te] != '=') )
+ { te++; count--; }
+ if ( te <= ts )
+ goto doneparsing;
+ tl = te - ts;
+ fs = ts; fe = te; fl = tl; // save the field markers.
+ // skip " = " (ignores extra equals.)
+ while ( count && (isspace(buffer[te]) || (buffer[te] == '=')) )
+ { te++; count--; }
+ ts = te;
+ while ( count && !isspace(buffer[te]) ) { te++; count--; }
+ tl = te - ts;
+
+ if ( (fl <= 0) || (tl <= 0) ) goto bad;
+
+ /* NB. Prefix matches must go first!
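 The field keys are
 * matched with strncmp() against the user-supplied field length, so a key
 * that is a prefix of another ("src" vs "srcaddr") must be tested first or
 * the longer key falsely matches. The pitfall in isolation (illustrative
 * only, not part of the patch):
 */

#include <stdio.h>
#include <string.h>

int main(void)
{
    const char *field = "src";  /* field text as typed by the user */
    int fl = 3;                 /* parsed field length             */

    /* Wrong order: only fl bytes of "srcaddr" are compared, so the
     * longer key falsely matches the short input (prints 1). */
    printf("matches srcaddr? %d\n", strncmp(field, "srcaddr", fl) == 0);

    /* Right order: the short key is tested first (also prints 1). */
    printf("matches src?     %d\n", strncmp(field, "src", fl) == 0);
    return 0;
}

/* Keys that are prefixes of longer keys come first: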
*/ + if (strncmp(&buffer[fs], "src", fl) == 0) + { + op.u.net_rule.src_dom = VIF_SPECIAL; + op.u.net_rule.src_idx = VIF_ANY_INTERFACE; + } + else if (strncmp(&buffer[fs], "dst", fl) == 0) + { + op.u.net_rule.dst_dom = VIF_SPECIAL; + op.u.net_rule.dst_idx = VIF_PHYSICAL_INTERFACE; + } + else if (strncmp(&buffer[fs], "srcaddr", fl) == 0) + { + op.u.net_rule.src_addr = getipaddr(&buffer[ts], tl); + } + else if (strncmp(&buffer[fs], "dstaddr", fl) == 0) + { + op.u.net_rule.dst_addr = getipaddr(&buffer[ts], tl); + } + else if (strncmp(&buffer[fs], "srcaddrmask", fl) == 0) + { + op.u.net_rule.src_addr_mask = getipaddr(&buffer[ts], tl); + } + else if (strncmp(&buffer[fs], "dstaddrmask", fl) == 0) + { + op.u.net_rule.dst_addr_mask = getipaddr(&buffer[ts], tl); + } + else if (strncmp(&buffer[fs], "srcport", fl) == 0) + { + op.u.net_rule.src_port = antous(&buffer[ts], tl); + } + else if (strncmp(&buffer[fs], "dstport", fl) == 0) + { + op.u.net_rule.dst_port = antous(&buffer[ts], tl); + } + else if (strncmp(&buffer[fs], "srcportmask", fl) == 0) + { + op.u.net_rule.src_port_mask = antous(&buffer[ts], tl); + } + else if (strncmp(&buffer[fs], "dstportmask", fl) == 0) + { + op.u.net_rule.dst_port_mask = antous(&buffer[ts], tl); + } + else if (strncmp(&buffer[fs], "srcdom", fl) == 0) + { + op.u.net_rule.src_dom = antoull(&buffer[ts], tl); + } + else if (strncmp(&buffer[fs], "srcidx", fl) == 0) + { + op.u.net_rule.src_idx = anton(&buffer[ts], tl); + } + else if (strncmp(&buffer[fs], "dstdom", fl) == 0) + { + op.u.net_rule.dst_dom = antoull(&buffer[ts], tl); + } + else if (strncmp(&buffer[fs], "dstidx", fl) == 0) + { + op.u.net_rule.dst_idx = anton(&buffer[ts], tl); + } + else if ( (strncmp(&buffer[fs], "proto", fl) == 0)) + { + if (strncmp(&buffer[ts], "any", tl) == 0) + op.u.net_rule.proto = NETWORK_PROTO_ANY; + if (strncmp(&buffer[ts], "ip", tl) == 0) + op.u.net_rule.proto = NETWORK_PROTO_IP; + if (strncmp(&buffer[ts], "tcp", tl) == 0) + op.u.net_rule.proto = NETWORK_PROTO_TCP; + if (strncmp(&buffer[ts], "udp", tl) == 0) + op.u.net_rule.proto = NETWORK_PROTO_UDP; + if (strncmp(&buffer[ts], "arp", tl) == 0) + op.u.net_rule.proto = NETWORK_PROTO_ARP; + } + } + + doneparsing: + ret = HYPERVISOR_network_op(&op); + return(len); + + bad: + return(len); + + +} + +static int __init init_module(void) +{ + if ( !(start_info.flags & SIF_PRIVILEGED) ) + return 0; + + *readbuf = '\0'; + proc_vfr = create_xen_proc_entry("vfr", 0600); + if ( proc_vfr != NULL ) + { + proc_vfr->owner = THIS_MODULE; + proc_vfr->nlink = 1; + proc_vfr->read_proc = vfr_read_proc; + proc_vfr->write_proc = vfr_write_proc; + printk("Successfully installed virtual firewall/router interface\n"); + } + return 0; +} + +static void __exit cleanup_module(void) +{ + if ( proc_vfr == NULL ) return; + remove_xen_proc_entry("vfr"); + proc_vfr = NULL; +} + +module_init(init_module); +module_exit(cleanup_module); + +/* Helper functions start here: */ + +int anton(const char *buff, int len) +{ + int ret; + char c; + int sign = 1; + + ret = 0; + + if (len == 0) return 0; + if (*buff == '-') { sign = -1; buff++; len--; } + + while ( (len) && ((c = *buff) >= '0') && (c <= '9') ) + { + ret *= 10; + ret += c - '0'; + buff++; len--; + } + + ret *= sign; + return ret; +} + +u16 antous(const char *buff, int len) +{ + u16 ret; + char c; + + ret = 0; + + while ( (len) && ((c = *buff) >= '0') && (c <= '9') ) + { + ret *= 10; + ret += c - '0'; + buff++; len--; + } + + return ret; +} + +u64 antoull(const char *buff, int len) +{ + u64 ret; + char c; + + ret = 0; + + 
while ( (len) && ((c = *buff) >= '0') && (c <= '9') ) + { + ret *= 10; + ret += c - '0'; + buff++; len--; + } + + return ret; +} + +u32 getipaddr(const char *buff, unsigned int len) +{ + char c; + u32 ret, val; + + ret = 0; val = 0; + + while ( len ) + { + if (!((((c = *buff) >= '0') && ( c <= '9')) || ( c == '.' ) ) ) + { + return(0); // malformed. + } + + if ( c == '.' ) { + if (val > 255) return (0); //malformed. + ret = ret << 8; + ret += val; + val = 0; + len--; buff++; + continue; + } + val *= 10; + val += c - '0'; + buff++; len--; + } + ret = ret << 8; + ret += val; + + return (ret); +} + diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/evtchn/Makefile b/linux-2.4.26-xen-sparse/arch/xen/drivers/evtchn/Makefile new file mode 100644 index 0000000000..61c983f625 --- /dev/null +++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/evtchn/Makefile @@ -0,0 +1,3 @@ +O_TARGET := drv.o +obj-y := evtchn.o +include $(TOPDIR)/Rules.make diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/evtchn/evtchn.c b/linux-2.4.26-xen-sparse/arch/xen/drivers/evtchn/evtchn.c new file mode 100644 index 0000000000..985d72821d --- /dev/null +++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/evtchn/evtchn.c @@ -0,0 +1,372 @@ +/****************************************************************************** + * evtchn.c + * + * Xenolinux driver for receiving and demuxing event-channel signals. + * + * Copyright (c) 2004, K A Fraser + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/fs.h> +#include <linux/errno.h> +#include <linux/miscdevice.h> +#include <linux/major.h> +#include <linux/proc_fs.h> +#include <linux/devfs_fs_kernel.h> +#include <linux/stat.h> +#include <linux/poll.h> +#include <linux/irq.h> +#include <asm/evtchn.h> + +/* NB. This must be shared amongst drivers if more things go in /dev/xen */ +static devfs_handle_t xen_dev_dir; + +/* Only one process may open /dev/xen/evtchn at any time. */ +static unsigned long evtchn_dev_inuse; + +/* Notification ring, accessed via /dev/xen/evtchn. */ +#define RING_SIZE 2048 /* 2048 16-bit entries */ +#define RING_MASK(_i) ((_i)&(RING_SIZE-1)) +static u16 *ring; +static unsigned int ring_cons, ring_prod, ring_overflow; + +/* Processes wait on this queue via /dev/xen/evtchn when ring is empty. */ +static DECLARE_WAIT_QUEUE_HEAD(evtchn_wait); +static struct fasync_struct *evtchn_async_queue; + +/* Which ports is user-space bound to? */ +static u32 bound_ports[32]; + +static spinlock_t lock; + +void evtchn_device_upcall(int port) +{ + u16 port_subtype; + shared_info_t *s = HYPERVISOR_shared_info; + + spin_lock(&lock); + + mask_evtchn(port); + clear_evtchn(port); + + if ( likely(!synch_test_and_clear_bit(port, &s->evtchn_exception[0])) ) + port_subtype = PORT_NORMAL; + else + port_subtype = PORT_EXCEPTION; + + if ( ring != NULL ) + { + if ( (ring_prod - ring_cons) < RING_SIZE ) + { + ring[RING_MASK(ring_prod)] = (u16)port | port_subtype; + if ( ring_cons == ring_prod++ ) + { + wake_up_interruptible(&evtchn_wait); + kill_fasync(&evtchn_async_queue, SIGIO, POLL_IN); + } + } + else + { + ring_overflow = 1; + } + } + + spin_unlock(&lock); +} + +static void __evtchn_reset_buffer_ring(void) +{ + /* Initialise the ring to empty. Clear errors. 
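 Indexes are
 * free-running, so the ring is empty exactly when cons == prod. Because it
 * never holds more than RING_SIZE entries, the read path below can detect
 * a wrap between consumer and producer by testing the RING_SIZE bit alone.
 * That test in isolation (illustrative only, not part of the patch):
 */

#include <stdio.h>

#define DEMO_SIZE 2048u                /* entries, power of two (RING_SIZE) */
#define DEMO_MASK(_i) ((_i) & (DEMO_SIZE - 1u))

int main(void)
{
    unsigned int c = 2040u, p = 2053u; /* producer has passed the wrap */

    if ( ((c ^ p) & DEMO_SIZE) != 0 )  /* the DEMO_SIZE bit differs */
        printf("two chunks: %u entries, then %u\n",
               DEMO_SIZE - DEMO_MASK(c), DEMO_MASK(p));
    else
        printf("one chunk: %u entries\n", p - c);
    return 0;
}

/* Reset both indexes and the overflow flag: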
*/ + ring_cons = ring_prod = ring_overflow = 0; +} + +static ssize_t evtchn_read(struct file *file, char *buf, + size_t count, loff_t *ppos) +{ + int rc; + unsigned int c, p, bytes1 = 0, bytes2 = 0; + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue(&evtchn_wait, &wait); + + count &= ~1; /* even number of bytes */ + + if ( count == 0 ) + { + rc = 0; + goto out; + } + + if ( count > PAGE_SIZE ) + count = PAGE_SIZE; + + for ( ; ; ) + { + set_current_state(TASK_INTERRUPTIBLE); + + if ( (c = ring_cons) != (p = ring_prod) ) + break; + + if ( ring_overflow ) + { + rc = -EFBIG; + goto out; + } + + if ( file->f_flags & O_NONBLOCK ) + { + rc = -EAGAIN; + goto out; + } + + if ( signal_pending(current) ) + { + rc = -ERESTARTSYS; + goto out; + } + + schedule(); + } + + /* Byte lengths of two chunks. Chunk split (if any) is at ring wrap. */ + if ( ((c ^ p) & RING_SIZE) != 0 ) + { + bytes1 = (RING_SIZE - RING_MASK(c)) * sizeof(u16); + bytes2 = RING_MASK(p) * sizeof(u16); + } + else + { + bytes1 = (p - c) * sizeof(u16); + bytes2 = 0; + } + + /* Truncate chunks according to caller's maximum byte count. */ + if ( bytes1 > count ) + { + bytes1 = count; + bytes2 = 0; + } + else if ( (bytes1 + bytes2) > count ) + { + bytes2 = count - bytes1; + } + + if ( copy_to_user(buf, &ring[RING_MASK(c)], bytes1) || + ((bytes2 != 0) && copy_to_user(&buf[bytes1], &ring[0], bytes2)) ) + { + rc = -EFAULT; + goto out; + } + + ring_cons += (bytes1 + bytes2) / sizeof(u16); + + rc = bytes1 + bytes2; + + out: + __set_current_state(TASK_RUNNING); + remove_wait_queue(&evtchn_wait, &wait); + return rc; +} + +static ssize_t evtchn_write(struct file *file, const char *buf, + size_t count, loff_t *ppos) +{ + int rc, i; + u16 *kbuf = (u16 *)get_free_page(GFP_KERNEL); + + if ( kbuf == NULL ) + return -ENOMEM; + + count &= ~1; /* even number of bytes */ + + if ( count == 0 ) + { + rc = 0; + goto out; + } + + if ( count > PAGE_SIZE ) + count = PAGE_SIZE; + + if ( copy_from_user(kbuf, buf, count) != 0 ) + { + rc = -EFAULT; + goto out; + } + + spin_lock_irq(&lock); + for ( i = 0; i < (count/2); i++ ) + if ( test_bit(kbuf[i], &bound_ports[0]) ) + unmask_evtchn(kbuf[i]); + spin_unlock_irq(&lock); + + rc = count; + + out: + free_page((unsigned long)kbuf); + return rc; +} + +static int evtchn_ioctl(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg) +{ + int rc = 0; + + spin_lock_irq(&lock); + + switch ( cmd ) + { + case EVTCHN_RESET: + __evtchn_reset_buffer_ring(); + break; + case EVTCHN_BIND: + if ( !test_and_set_bit(arg, &bound_ports[0]) ) + unmask_evtchn(arg); + else + rc = -EINVAL; + break; + case EVTCHN_UNBIND: + if ( test_and_clear_bit(arg, &bound_ports[0]) ) + mask_evtchn(arg); + else + rc = -EINVAL; + break; + default: + rc = -ENOSYS; + break; + } + + spin_unlock_irq(&lock); + + return rc; +} + +static unsigned int evtchn_poll(struct file *file, poll_table *wait) +{ + unsigned int mask = POLLOUT | POLLWRNORM; + poll_wait(file, &evtchn_wait, wait); + if ( ring_cons != ring_prod ) + mask |= POLLIN | POLLRDNORM; + if ( ring_overflow ) + mask = POLLERR; + return mask; +} + +static int evtchn_fasync(int fd, struct file *filp, int on) +{ + return fasync_helper(fd, filp, on, &evtchn_async_queue); +} + +static int evtchn_open(struct inode *inode, struct file *filp) +{ + u16 *_ring; + + if ( test_and_set_bit(0, &evtchn_dev_inuse) ) + return -EBUSY; + + /* Allocate outside locked region so that we can use GFP_KERNEL. 
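 From user
 * space the device then behaves as sketched below: read() yields 16-bit
 * port numbers, writing them back unmasks those ports, and EVTCHN_BIND
 * registers interest in a port. (Hypothetical example; EVTCHN_BIND and
 * the device path come from asm/evtchn.h and the devfs setup at the
 * bottom of this file.)
 */

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <asm/evtchn.h>     /* EVTCHN_BIND; kernel header from this tree */

int main(void)
{
    unsigned short ports[64];
    ssize_t n;
    int i, fd = open("/dev/xen/evtchn", O_RDWR);

    if ( fd < 0 )
        return 1;
    if ( ioctl(fd, EVTCHN_BIND, 4) != 0 )    /* port 4: just an example */
        return 1;
    while ( (n = read(fd, ports, sizeof(ports))) > 0 )
    {
        for ( i = 0; i < (int)(n / 2); i++ )
            printf("event on port %u\n", ports[i]);
        if ( write(fd, ports, n) != n )      /* unmask the handled ports */
            break;
    }
    close(fd);
    return 0;
}

/* Allocate first, then publish the ring under the lock: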
*/ + if ( (_ring = (u16 *)get_free_page(GFP_KERNEL)) == NULL ) + return -ENOMEM; + + spin_lock_irq(&lock); + ring = _ring; + __evtchn_reset_buffer_ring(); + spin_unlock_irq(&lock); + + MOD_INC_USE_COUNT; + + return 0; +} + +static int evtchn_release(struct inode *inode, struct file *filp) +{ + int i; + + spin_lock_irq(&lock); + if ( ring != NULL ) + { + free_page((unsigned long)ring); + ring = NULL; + } + for ( i = 0; i < NR_EVENT_CHANNELS; i++ ) + if ( test_and_clear_bit(i, &bound_ports[0]) ) + mask_evtchn(i); + spin_unlock_irq(&lock); + + evtchn_dev_inuse = 0; + + MOD_DEC_USE_COUNT; + + return 0; +} + +static struct file_operations evtchn_fops = { + owner: THIS_MODULE, + read: evtchn_read, + write: evtchn_write, + ioctl: evtchn_ioctl, + poll: evtchn_poll, + fasync: evtchn_fasync, + open: evtchn_open, + release: evtchn_release +}; + +static struct miscdevice evtchn_miscdev = { + minor: EVTCHN_MINOR, + name: "evtchn", + fops: &evtchn_fops +}; + +static int __init init_module(void) +{ + devfs_handle_t symlink_handle; + int err, pos; + char link_dest[64]; + + /* (DEVFS) create '/dev/misc/evtchn'. */ + err = misc_register(&evtchn_miscdev); + if ( err != 0 ) + { + printk(KERN_ALERT "Could not register /dev/misc/evtchn\n"); + return err; + } + + /* (DEVFS) create directory '/dev/xen'. */ + xen_dev_dir = devfs_mk_dir(NULL, "xen", NULL); + + /* (DEVFS) &link_dest[pos] == '../misc/evtchn'. */ + pos = devfs_generate_path(evtchn_miscdev.devfs_handle, + &link_dest[3], + sizeof(link_dest) - 3); + if ( pos >= 0 ) + strncpy(&link_dest[pos], "../", 3); + + /* (DEVFS) symlink '/dev/xen/evtchn' -> '../misc/evtchn'. */ + (void)devfs_mk_symlink(xen_dev_dir, + "evtchn", + DEVFS_FL_DEFAULT, + &link_dest[pos], + &symlink_handle, + NULL); + + /* (DEVFS) automatically destroy the symlink with its destination. 
*/ + devfs_auto_unregister(evtchn_miscdev.devfs_handle, symlink_handle); + + printk("Event-channel device installed.\n"); + + return 0; +} + +static void cleanup_module(void) +{ + misc_deregister(&evtchn_miscdev); +} + +module_init(init_module); +module_exit(cleanup_module); diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/netif/Makefile b/linux-2.4.26-xen-sparse/arch/xen/drivers/netif/Makefile new file mode 100644 index 0000000000..20c8192d3d --- /dev/null +++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/netif/Makefile @@ -0,0 +1,10 @@ + +O_TARGET := drv.o + +subdir-y += frontend +obj-y += frontend/drv.o + +subdir-$(CONFIG_XEN_PHYSDEV_ACCESS) += backend +obj-$(CONFIG_XEN_PHYSDEV_ACCESS) += backend/drv.o + +include $(TOPDIR)/Rules.make diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/netif/backend/Makefile b/linux-2.4.26-xen-sparse/arch/xen/drivers/netif/backend/Makefile new file mode 100644 index 0000000000..9ffb0bd702 --- /dev/null +++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/netif/backend/Makefile @@ -0,0 +1,3 @@ +O_TARGET := drv.o +obj-y := main.o control.o interface.o +include $(TOPDIR)/Rules.make diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/netif/backend/common.h b/linux-2.4.26-xen-sparse/arch/xen/drivers/netif/backend/common.h new file mode 100644 index 0000000000..88881cdf66 --- /dev/null +++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/netif/backend/common.h @@ -0,0 +1,96 @@ +/****************************************************************************** + * arch/xen/drivers/netif/backend/common.h + */ + +#ifndef __NETIF__BACKEND__COMMON_H__ +#define __NETIF__BACKEND__COMMON_H__ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/interrupt.h> +#include <linux/slab.h> +#include <linux/ip.h> +#include <linux/in.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <asm/ctrl_if.h> +#include <asm/io.h> +#include "../netif.h" +#include "../../../../../net/bridge/br_private.h" + +#ifndef NDEBUG +#define ASSERT(_p) \ + if ( !(_p) ) { printk("Assertion '%s' failed, line %d, file %s", #_p , \ + __LINE__, __FILE__); *(int*)0=0; } +#define DPRINTK(_f, _a...) printk("(file=%s, line=%d) " _f, \ + __FILE__ , __LINE__ , ## _a ) +#else +#define ASSERT(_p) ((void)0) +#define DPRINTK(_f, _a...) ((void)0) +#endif + +typedef struct netif_st { + /* Unique identifier for this interface. */ + domid_t domid; + unsigned int handle; + + /* Physical parameters of the comms window. */ + unsigned long tx_shmem_frame; + unsigned long rx_shmem_frame; + unsigned int evtchn; + int irq; + + /* The shared rings and indexes. */ + netif_tx_interface_t *tx; + netif_rx_interface_t *rx; + + /* Private indexes into shared ring. */ + NETIF_RING_IDX rx_req_cons; + NETIF_RING_IDX rx_resp_prod; /* private version of shared variable */ + NETIF_RING_IDX tx_req_cons; + NETIF_RING_IDX tx_resp_prod; /* private version of shared variable */ + + /* Transmit shaping: allow 'credit_bytes' every 'credit_usec'. */ + unsigned long credit_bytes; + unsigned long credit_usec; + unsigned long remaining_credit; + struct timer_list credit_timeout; + + /* Miscellaneous private stuff. */ + enum { DISCONNECTED, DISCONNECTING, CONNECTED } status; + /* + * DISCONNECT response is deferred until pending requests are ack'ed. + * We therefore need to store the id from the original request. 
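 * The netif_get/netif_put pair defined further down implements exactly
 * this deferral: teardown runs when the last reference drops, not when
 * the DISCONNECT message arrives. The idiom in isolation (illustrative
 * only, not part of the patch; the real code uses atomic_t):
 */

#include <stdio.h>

typedef struct demo_if {
    int refcnt;                     /* atomic_t in the real driver */
} demo_if_t;

static void demo_teardown(demo_if_t *nif)
{
    printf("last reference dropped: tear down %p\n", (void *)nif);
}

static void demo_get(demo_if_t *nif) { nif->refcnt++; }

static void demo_put(demo_if_t *nif)
{
    if ( --nif->refcnt == 0 )       /* atomic_dec_and_test() in the macro */
        demo_teardown(nif);
}

int main(void)
{
    demo_if_t nif = { 0 };
    demo_get(&nif);                 /* taken when the interface connects */
    demo_get(&nif);                 /* held by an in-flight request      */
    demo_put(&nif);                 /* request acknowledged              */
    demo_put(&nif);                 /* disconnect: last ref, tear down   */
    return 0;
}

/* Bookkeeping for the deferred response: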
+ */ + u8 disconnect_rspid; + struct netif_st *hash_next; + struct list_head list; /* scheduling list */ + atomic_t refcnt; + spinlock_t rx_lock, tx_lock; + struct net_device *dev; + struct net_device_stats stats; +} netif_t; + +void netif_create(netif_be_create_t *create); +void netif_destroy(netif_be_destroy_t *destroy); +void netif_connect(netif_be_connect_t *connect); +int netif_disconnect(netif_be_disconnect_t *disconnect, u8 rsp_id); +void __netif_disconnect_complete(netif_t *netif); +netif_t *netif_find_by_handle(domid_t domid, unsigned int handle); +#define netif_get(_b) (atomic_inc(&(_b)->refcnt)) +#define netif_put(_b) \ + do { \ + if ( atomic_dec_and_test(&(_b)->refcnt) ) \ + __netif_disconnect_complete(_b); \ + } while (0) + +void netif_interface_init(void); +void netif_ctrlif_init(void); + +void netif_deschedule(netif_t *netif); + +int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev); +struct net_device_stats *netif_be_get_stats(struct net_device *dev); +void netif_be_int(int irq, void *dev_id, struct pt_regs *regs); + +#endif /* __NETIF__BACKEND__COMMON_H__ */ diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/netif/backend/control.c b/linux-2.4.26-xen-sparse/arch/xen/drivers/netif/backend/control.c new file mode 100644 index 0000000000..cf1b075031 --- /dev/null +++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/netif/backend/control.c @@ -0,0 +1,65 @@ +/****************************************************************************** + * arch/xen/drivers/netif/backend/control.c + * + * Routines for interfacing with the control plane. + * + * Copyright (c) 2004, Keir Fraser + */ + +#include "common.h" + +static void netif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id) +{ + switch ( msg->subtype ) + { + case CMSG_NETIF_BE_CREATE: + if ( msg->length != sizeof(netif_be_create_t) ) + goto parse_error; + netif_create((netif_be_create_t *)&msg->msg[0]); + break; + case CMSG_NETIF_BE_DESTROY: + if ( msg->length != sizeof(netif_be_destroy_t) ) + goto parse_error; + netif_destroy((netif_be_destroy_t *)&msg->msg[0]); + break; + case CMSG_NETIF_BE_CONNECT: + if ( msg->length != sizeof(netif_be_connect_t) ) + goto parse_error; + netif_connect((netif_be_connect_t *)&msg->msg[0]); + break; + case CMSG_NETIF_BE_DISCONNECT: + if ( msg->length != sizeof(netif_be_disconnect_t) ) + goto parse_error; + if ( !netif_disconnect((netif_be_disconnect_t *)&msg->msg[0],msg->id) ) + return; /* Sending the response is deferred until later. */ + break; + default: + goto parse_error; + } + + ctrl_if_send_response(msg); + return; + + parse_error: + DPRINTK("Parse error while reading message subtype %d, len %d\n", + msg->subtype, msg->length); + msg->length = 0; + ctrl_if_send_response(msg); +} + +void netif_ctrlif_init(void) +{ + ctrl_msg_t cmsg; + netif_be_driver_status_changed_t st; + + (void)ctrl_if_register_receiver(CMSG_NETIF_BE, netif_ctrlif_rx, + CALLBACK_IN_BLOCKING_CONTEXT); + + /* Send a driver-UP notification to the domain controller. 
*/ + cmsg.type = CMSG_NETIF_BE; + cmsg.subtype = CMSG_NETIF_BE_DRIVER_STATUS_CHANGED; + cmsg.length = sizeof(netif_be_driver_status_changed_t); + st.status = NETIF_DRIVER_STATUS_UP; + memcpy(cmsg.msg, &st, sizeof(st)); + ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE); +} diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/netif/backend/interface.c b/linux-2.4.26-xen-sparse/arch/xen/drivers/netif/backend/interface.c new file mode 100644 index 0000000000..5a2da3d29b --- /dev/null +++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/netif/backend/interface.c @@ -0,0 +1,304 @@ +/****************************************************************************** + * arch/xen/drivers/netif/backend/interface.c + * + * Network-device interface management. + * + * Copyright (c) 2004, Keir Fraser + */ + +#include "common.h" +#include <linux/rtnetlink.h> + +#define NETIF_HASHSZ 1024 +#define NETIF_HASH(_d,_h) (((int)(_d)^(int)(_h))&(NETIF_HASHSZ-1)) + +static netif_t *netif_hash[NETIF_HASHSZ]; +static struct net_device *bridge_dev; +static struct net_bridge *bridge_br; + +netif_t *netif_find_by_handle(domid_t domid, unsigned int handle) +{ + netif_t *netif = netif_hash[NETIF_HASH(domid, handle)]; + while ( (netif != NULL) && + ((netif->domid != domid) || (netif->handle != handle)) ) + netif = netif->hash_next; + return netif; +} + +void __netif_disconnect_complete(netif_t *netif) +{ + ctrl_msg_t cmsg; + netif_be_disconnect_t disc; + + /* + * These can't be done in __netif_disconnect() because at that point there + * may be outstanding requests at the disc whose asynchronous responses + * must still be notified to the remote driver. + */ + unbind_evtchn_from_irq(netif->evtchn); + vfree(netif->tx); /* Frees netif->rx as well. */ + rtnl_lock(); + (void)br_del_if(bridge_br, netif->dev); + (void)dev_close(netif->dev); + rtnl_unlock(); + + /* Construct the deferred response message. */ + cmsg.type = CMSG_NETIF_BE; + cmsg.subtype = CMSG_NETIF_BE_DISCONNECT; + cmsg.id = netif->disconnect_rspid; + cmsg.length = sizeof(netif_be_disconnect_t); + disc.domid = netif->domid; + disc.netif_handle = netif->handle; + disc.status = NETIF_BE_STATUS_OKAY; + memcpy(cmsg.msg, &disc, sizeof(disc)); + + /* + * Make sure message is constructed /before/ status change, because + * after the status change the 'netif' structure could be deallocated at + * any time. Also make sure we send the response /after/ status change, + * as otherwise a subsequent CONNECT request could spuriously fail if + * another CPU doesn't see the status change yet. + */ + mb(); + if ( netif->status != DISCONNECTING ) + BUG(); + netif->status = DISCONNECTED; + mb(); + + /* Send the successful response. 
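 As an aside,
 * lookups of this interface go through netif_find_by_handle() above, a
 * simple chained hash on (domid, handle). The same structure in miniature
 * (illustrative only, not part of the patch):
 */

#include <stdio.h>

#define DEMO_HASHSZ 8                    /* 1024 in the real table */
#define DEMO_HASH(_d,_h) (((unsigned)(_d) ^ (unsigned)(_h)) & (DEMO_HASHSZ-1))

typedef struct demo_netif {
    unsigned domid, handle;
    struct demo_netif *hash_next;        /* chain on collision */
} demo_netif_t;

static demo_netif_t *table[DEMO_HASHSZ];

static demo_netif_t *find(unsigned domid, unsigned handle)
{
    demo_netif_t *n = table[DEMO_HASH(domid, handle)];
    while ( (n != NULL) && ((n->domid != domid) || (n->handle != handle)) )
        n = n->hash_next;
    return n;
}

int main(void)
{
    demo_netif_t a = { 1, 0, NULL };
    table[DEMO_HASH(1, 0)] = &a;
    printf("found: %s\n", (find(1, 0) == &a) ? "yes" : "no");
    return 0;
}

/* Respond only after the status change is globally visible: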
*/ + ctrl_if_send_response(&cmsg); +} + +void netif_create(netif_be_create_t *create) +{ + domid_t domid = create->domid; + unsigned int handle = create->netif_handle; + struct net_device *dev; + netif_t **pnetif, *netif; + char name[IFNAMSIZ]; + + snprintf(name, IFNAMSIZ, "vif%u.%u", domid, handle); + dev = alloc_netdev(sizeof(netif_t), name, ether_setup); + if ( dev == NULL ) + { + DPRINTK("Could not create netif: out of memory\n"); + create->status = NETIF_BE_STATUS_OUT_OF_MEMORY; + return; + } + + netif = dev->priv; + memset(netif, 0, sizeof(*netif)); + netif->domid = domid; + netif->handle = handle; + netif->status = DISCONNECTED; + spin_lock_init(&netif->rx_lock); + spin_lock_init(&netif->tx_lock); + atomic_set(&netif->refcnt, 0); + netif->dev = dev; + + netif->credit_bytes = netif->remaining_credit = ~0UL; + netif->credit_usec = 0UL; + /*init_ac_timer(&new_vif->credit_timeout);*/ + + pnetif = &netif_hash[NETIF_HASH(domid, handle)]; + while ( *pnetif != NULL ) + { + if ( ((*pnetif)->domid == domid) && ((*pnetif)->handle == handle) ) + { + DPRINTK("Could not create netif: already exists\n"); + create->status = NETIF_BE_STATUS_INTERFACE_EXISTS; + kfree(dev); + return; + } + pnetif = &(*pnetif)->hash_next; + } + + dev->hard_start_xmit = netif_be_start_xmit; + dev->get_stats = netif_be_get_stats; + memcpy(dev->dev_addr, create->mac, ETH_ALEN); + + /* Disable queuing. */ + dev->tx_queue_len = 0; + + /* XXX In bridge mode we should force a different MAC from remote end. */ + dev->dev_addr[2] ^= 1; + + if ( register_netdev(dev) != 0 ) + { + DPRINTK("Could not register new net device\n"); + create->status = NETIF_BE_STATUS_OUT_OF_MEMORY; + kfree(dev); + return; + } + + netif->hash_next = *pnetif; + *pnetif = netif; + + DPRINTK("Successfully created netif\n"); + create->status = NETIF_BE_STATUS_OKAY; +} + +void netif_destroy(netif_be_destroy_t *destroy) +{ + domid_t domid = destroy->domid; + unsigned int handle = destroy->netif_handle; + netif_t **pnetif, *netif; + + pnetif = &netif_hash[NETIF_HASH(domid, handle)]; + while ( (netif = *pnetif) != NULL ) + { + if ( (netif->domid == domid) && (netif->handle == handle) ) + { + if ( netif->status != DISCONNECTED ) + goto still_connected; + goto destroy; + } + pnetif = &netif->hash_next; + } + + destroy->status = NETIF_BE_STATUS_INTERFACE_NOT_FOUND; + return; + + still_connected: + destroy->status = NETIF_BE_STATUS_INTERFACE_CONNECTED; + return; + + destroy: + *pnetif = netif->hash_next; + unregister_netdev(netif->dev); + kfree(netif->dev); + destroy->status = NETIF_BE_STATUS_OKAY; +} + +void netif_connect(netif_be_connect_t *connect) +{ + domid_t domid = connect->domid; + unsigned int handle = connect->netif_handle; + unsigned int evtchn = connect->evtchn; + unsigned long tx_shmem_frame = connect->tx_shmem_frame; + unsigned long rx_shmem_frame = connect->rx_shmem_frame; + struct vm_struct *vma; + pgprot_t prot; + int error; + netif_t *netif; + struct net_device *eth0_dev; + + netif = netif_find_by_handle(domid, handle); + if ( unlikely(netif == NULL) ) + { + DPRINTK("netif_connect attempted for non-existent netif (%u,%u)\n", + connect->domid, connect->netif_handle); + connect->status = NETIF_BE_STATUS_INTERFACE_NOT_FOUND; + return; + } + + if ( netif->status != DISCONNECTED ) + { + connect->status = NETIF_BE_STATUS_INTERFACE_CONNECTED; + return; + } + + if ( (vma = get_vm_area(2*PAGE_SIZE, VM_IOREMAP)) == NULL ) + { + connect->status = NETIF_BE_STATUS_OUT_OF_MEMORY; + return; + } + + prot = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | 
_PAGE_ACCESSED);
+    error = direct_remap_area_pages(&init_mm,
+                                    VMALLOC_VMADDR(vma->addr),
+                                    tx_shmem_frame<<PAGE_SHIFT, PAGE_SIZE,
+                                    prot, domid);
+    error |= direct_remap_area_pages(&init_mm,
+                                     VMALLOC_VMADDR(vma->addr) + PAGE_SIZE,
+                                     rx_shmem_frame<<PAGE_SHIFT, PAGE_SIZE,
+                                     prot, domid);
+    if ( error != 0 )
+    {
+        if ( error == -ENOMEM )
+            connect->status = NETIF_BE_STATUS_OUT_OF_MEMORY;
+        else if ( error == -EFAULT )
+            connect->status = NETIF_BE_STATUS_MAPPING_ERROR;
+        else
+            connect->status = NETIF_BE_STATUS_ERROR;
+        vfree(vma->addr);
+        return;
+    }
+
+    netif->evtchn         = evtchn;
+    netif->irq            = bind_evtchn_to_irq(evtchn);
+    netif->tx_shmem_frame = tx_shmem_frame;
+    netif->rx_shmem_frame = rx_shmem_frame;
+    netif->tx             = (netif_tx_interface_t *)vma->addr;
+    netif->rx             = (netif_rx_interface_t *)((char *)vma->addr + PAGE_SIZE);
+    netif->status         = CONNECTED;
+    netif_get(netif);
+
+    rtnl_lock();
+
+    (void)dev_open(netif->dev);
+    (void)br_add_if(bridge_br, netif->dev);
+
+    /*
+     * The default config is a very simple binding to eth0.
+     * If eth0 is being used as an IP interface by this OS then someone
+     * must add eth0's IP address to nbe-br, and change the routing table
+     * to refer to nbe-br instead of eth0.
+     */
+    (void)dev_open(bridge_dev);
+    if ( (eth0_dev = __dev_get_by_name("eth0")) != NULL )
+    {
+        (void)dev_open(eth0_dev);
+        (void)br_add_if(bridge_br, eth0_dev);
+    }
+
+    rtnl_unlock();
+
+    (void)request_irq(netif->irq, netif_be_int, 0, netif->dev->name, netif);
+    netif_start_queue(netif->dev);
+
+    connect->status = NETIF_BE_STATUS_OKAY;
+}
+
+int netif_disconnect(netif_be_disconnect_t *disconnect, u8 rsp_id)
+{
+    domid_t       domid  = disconnect->domid;
+    unsigned int  handle = disconnect->netif_handle;
+    netif_t      *netif;
+
+    netif = netif_find_by_handle(domid, handle);
+    if ( unlikely(netif == NULL) )
+    {
+        DPRINTK("netif_disconnect attempted for non-existent netif"
+                " (%u,%u)\n", disconnect->domid, disconnect->netif_handle);
+        disconnect->status = NETIF_BE_STATUS_INTERFACE_NOT_FOUND;
+        return 1; /* Caller will send response error message. */
+    }
+
+    if ( netif->status == CONNECTED )
+    {
+        netif->status = DISCONNECTING;
+        netif->disconnect_rspid = rsp_id;
+        wmb(); /* Let other CPUs see the status change. */
+        netif_stop_queue(netif->dev);
+        free_irq(netif->irq, NULL);
+        netif_deschedule(netif);
+        netif_put(netif);
+    }
+
+    return 0; /* Caller should not send response message. */
+}
+
+void netif_interface_init(void)
+{
+    memset(netif_hash, 0, sizeof(netif_hash));
+    if ( br_add_bridge("nbe-br") != 0 )
+        BUG();
+    bridge_dev = __dev_get_by_name("nbe-br");
+    bridge_br  = (struct net_bridge *)bridge_dev->priv;
+    bridge_br->bridge_hello_time    = bridge_br->hello_time    = 0;
+    bridge_br->bridge_forward_delay = bridge_br->forward_delay = 0;
+    bridge_br->stp_enabled          = 0;
+}
diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/netif/backend/main.c b/linux-2.4.26-xen-sparse/arch/xen/drivers/netif/backend/main.c
new file mode 100644
index 0000000000..2c4e5cb211
--- /dev/null
+++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/netif/backend/main.c
@@ -0,0 +1,772 @@
+/******************************************************************************
+ * arch/xen/drivers/netif/backend/main.c
+ *
+ * Back-end of the driver for virtual network devices. This portion of the
+ * driver exports a 'unified' network-device interface that can be accessed
+ * by any operating system that implements a compatible front end.
A + * reference front-end implementation can be found in: + * arch/xen/drivers/netif/frontend + * + * Copyright (c) 2002-2004, K A Fraser + */ + +#include "common.h" + +static void netif_page_release(struct page *page); +static void make_tx_response(netif_t *netif, + u16 id, + s8 st); +static int make_rx_response(netif_t *netif, + u16 id, + s8 st, + memory_t addr, + u16 size); + +static void net_tx_action(unsigned long unused); +static DECLARE_TASKLET(net_tx_tasklet, net_tx_action, 0); + +static void net_rx_action(unsigned long unused); +static DECLARE_TASKLET(net_rx_tasklet, net_rx_action, 0); + +typedef struct { + u16 id; + unsigned long old_mach_ptr; + unsigned long new_mach_pfn; + netif_t *netif; +} rx_info_t; +static struct sk_buff_head rx_queue; +static multicall_entry_t rx_mcl[NETIF_RX_RING_SIZE*2]; +static mmu_update_t rx_mmu[NETIF_RX_RING_SIZE*3]; +static unsigned char rx_notify[NR_EVENT_CHANNELS]; + +/* Don't currently gate addition of an interface to the tx scheduling list. */ +#define tx_work_exists(_if) (1) + +#define MAX_PENDING_REQS 256 +static unsigned long mmap_vstart; +#define MMAP_VADDR(_req) (mmap_vstart + ((_req) * PAGE_SIZE)) + +#define PKT_PROT_LEN (ETH_HLEN + 20) + +static u16 pending_id[MAX_PENDING_REQS]; +static netif_t *pending_netif[MAX_PENDING_REQS]; +static u16 pending_ring[MAX_PENDING_REQS]; +typedef unsigned int PEND_RING_IDX; +#define MASK_PEND_IDX(_i) ((_i)&(MAX_PENDING_REQS-1)) +static PEND_RING_IDX pending_prod, pending_cons; +#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons) + +/* Freed TX SKBs get batched on this ring before return to pending_ring. */ +static u16 dealloc_ring[MAX_PENDING_REQS]; +static spinlock_t dealloc_lock = SPIN_LOCK_UNLOCKED; +static PEND_RING_IDX dealloc_prod, dealloc_cons; + +typedef struct { + u16 idx; + netif_tx_request_t req; + netif_t *netif; +} tx_info_t; +static struct sk_buff_head tx_queue; +static multicall_entry_t tx_mcl[MAX_PENDING_REQS]; + +static struct list_head net_schedule_list; +static spinlock_t net_schedule_list_lock; + +#define MAX_MFN_ALLOC 64 +static unsigned long mfn_list[MAX_MFN_ALLOC]; +static unsigned int alloc_index = 0; +static spinlock_t mfn_lock = SPIN_LOCK_UNLOCKED; + +static void __refresh_mfn_list(void) +{ + int ret = HYPERVISOR_dom_mem_op(MEMOP_increase_reservation, + mfn_list, MAX_MFN_ALLOC); + if ( unlikely(ret != MAX_MFN_ALLOC) ) + { + printk(KERN_ALERT "Unable to increase memory reservation (%d)\n", ret); + BUG(); + } + alloc_index = MAX_MFN_ALLOC; +} + +static unsigned long get_new_mfn(void) +{ + unsigned long mfn, flags; + spin_lock_irqsave(&mfn_lock, flags); + if ( alloc_index == 0 ) + __refresh_mfn_list(); + mfn = mfn_list[--alloc_index]; + spin_unlock_irqrestore(&mfn_lock, flags); + return mfn; +} + +static void dealloc_mfn(unsigned long mfn) +{ + unsigned long flags; + spin_lock_irqsave(&mfn_lock, flags); + if ( alloc_index != MAX_MFN_ALLOC ) + mfn_list[alloc_index++] = mfn; + else + (void)HYPERVISOR_dom_mem_op(MEMOP_decrease_reservation, &mfn, 1); + spin_unlock_irqrestore(&mfn_lock, flags); +} + +static inline void maybe_schedule_tx_action(void) +{ + smp_mb(); + if ( (NR_PENDING_REQS < (MAX_PENDING_REQS/2)) && + !list_empty(&net_schedule_list) ) + tasklet_schedule(&net_tx_tasklet); +} + +/* + * This is the primary RECEIVE function for a network interface. + * Note that, from the p.o.v. of /this/ OS it looks like a transmit. 
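The pending-ring bookkeeping above (MASK_PEND_IDX, NR_PENDING_REQS) relies on free-running indexes: the producer and consumer counters are only reduced modulo the ring size at the moment of an array access, so occupancy is plain subtraction and keeps working across 2^32 wrap-around. A tiny standalone illustration of the arithmetic (the ring size must be a power of two for the mask to be valid):

    #include <stdio.h>

    #define RING_SIZE 256u                    /* must be a power of two */
    #define MASK(i)   ((i) & (RING_SIZE - 1)) /* counter -> array slot  */

    int main(void)
    {
        unsigned int prod = 260, cons = 250;  /* free-running counters  */
        printf("entries in ring: %u\n", prod - cons);            /* 10  */
        printf("slot for 260   : %u\n", MASK(prod));             /* 4   */
        printf("ring full?     : %d\n", (prod - cons) == RING_SIZE);
        return 0;
    }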
+ */
+int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+    netif_t *netif = (netif_t *)dev->priv;
+
+    /* Drop the packet if the target domain has no receive buffers. */
+    if ( (netif->rx_req_cons == netif->rx->req_prod) ||
+         ((netif->rx_req_cons-netif->rx_resp_prod) == NETIF_RX_RING_SIZE) )
+        goto drop;
+
+    /*
+     * We do not copy the packet unless:
+     *  1. The data is shared; or
+     *  2. It spans a page boundary; or
+     *  3. We cannot be sure the whole data page is allocated.
+     * The copying method is taken from skb_copy().
+     * NB. We also couldn't cope with fragmented packets, but we won't get
+     * any because we do not advertise the NETIF_F_SG feature.
+     */
+    if ( skb_shared(skb) || skb_cloned(skb) ||
+         (((unsigned long)skb->end ^ (unsigned long)skb->head) & PAGE_MASK) ||
+         ((skb->end - skb->head) < (PAGE_SIZE/2)) )
+    {
+        struct sk_buff *nskb = alloc_skb(PAGE_SIZE-1024, GFP_ATOMIC);
+        int hlen = skb->data - skb->head;
+        if ( unlikely(nskb == NULL) )
+            goto drop;
+        skb_reserve(nskb, hlen);
+        __skb_put(nskb, skb->len);
+        (void)skb_copy_bits(skb, -hlen, nskb->head, hlen + skb->len);
+        dev_kfree_skb(skb);
+        skb = nskb;
+    }
+
+    ((rx_info_t *)&skb->cb[0])->id =
+        netif->rx->ring[MASK_NETIF_RX_IDX(netif->rx_req_cons++)].req.id;
+    ((rx_info_t *)&skb->cb[0])->netif = netif;
+
+    __skb_queue_tail(&rx_queue, skb);
+    tasklet_schedule(&net_rx_tasklet);
+
+    return 0;
+
+ drop:
+    netif->stats.rx_dropped++;
+    dev_kfree_skb(skb);
+    return 0;
+}
+
+#if 0
+static void xen_network_done_notify(void)
+{
+    static struct net_device *eth0_dev = NULL;
+    if ( unlikely(eth0_dev == NULL) )
+        eth0_dev = __dev_get_by_name("eth0");
+    netif_rx_schedule(eth0_dev);
+}
+/*
+ * Add following to poll() function in NAPI driver (Tigon3 is example):
+ *  if ( xen_network_done() )
+ *      tg3_enable_ints(tp);
+ */
+int xen_network_done(void)
+{
+    return skb_queue_empty(&rx_queue);
+}
+#endif
+
+static void net_rx_action(unsigned long unused)
+{
+    netif_t *netif;
+    s8 status;
+    u16 size, id, evtchn;
+    mmu_update_t *mmu = rx_mmu;
+    multicall_entry_t *mcl;
+    unsigned long vdata, mdata, new_mfn;
+    struct sk_buff_head rxq;
+    struct sk_buff *skb;
+    u16 notify_list[NETIF_RX_RING_SIZE];
+    int notify_nr = 0;
+
+    skb_queue_head_init(&rxq);
+
+    mcl = rx_mcl;
+    while ( (skb = __skb_dequeue(&rx_queue)) != NULL )
+    {
+        netif   = ((rx_info_t *)&skb->cb[0])->netif;
+        vdata   = (unsigned long)skb->data;
+        mdata   = virt_to_machine(vdata);
+        new_mfn = get_new_mfn();
+
+        mmu[0].ptr  = (new_mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE;
+        mmu[0].val  = __pa(vdata) >> PAGE_SHIFT;
+        mmu[1].val  = (unsigned long)(netif->domid<<16) & ~0xFFFFUL;
+        mmu[1].ptr  = (unsigned long)(netif->domid<< 0) & ~0xFFFFUL;
+        mmu[1].ptr |= MMU_EXTENDED_COMMAND;
+        mmu[1].val |= MMUEXT_SET_SUBJECTDOM;
+        mmu[2].ptr  = (mdata & PAGE_MASK) | MMU_EXTENDED_COMMAND;
+        mmu[2].val  = MMUEXT_REASSIGN_PAGE;
+
+        mcl[0].op      = __HYPERVISOR_update_va_mapping;
+        mcl[0].args[0] = vdata >> PAGE_SHIFT;
+        mcl[0].args[1] = (new_mfn << PAGE_SHIFT) | __PAGE_KERNEL;
+        mcl[0].args[2] = 0;
+        mcl[1].op      = __HYPERVISOR_mmu_update;
+        mcl[1].args[0] = (unsigned long)mmu;
+        mcl[1].args[1] = 3;
+        mcl[1].args[2] = 0;
+
+        mmu += 3;
+        mcl += 2;
+
+        ((rx_info_t *)&skb->cb[0])->old_mach_ptr = mdata;
+        ((rx_info_t *)&skb->cb[0])->new_mach_pfn = new_mfn;
+        __skb_queue_tail(&rxq, skb);
+
+        /* Filled the batch queue?
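A few screens up, netif_be_start_xmit parks per-packet state in skb->cb[] through the rx_info_t overlay; cb[] is the 48-byte scratch area that belongs to whichever layer currently holds the skb. A minimal sketch of the overlay pattern, with illustrative field names rather than the driver's own:

    #include <assert.h>
    #include <string.h>

    struct fake_skb { char cb[48]; };   /* Linux skbs carry a cb[48] scratch area */
    struct rx_info  { unsigned short id; void *owner; };

    static void stamp(struct fake_skb *skb, unsigned short id, void *owner)
    {
        /* The overlay must fit inside cb[]; worth asserting in real code too. */
        assert(sizeof(struct rx_info) <= sizeof(skb->cb));
        ((struct rx_info *)&skb->cb[0])->id    = id;
        ((struct rx_info *)&skb->cb[0])->owner = owner;
    }

    int main(void)
    {
        struct fake_skb skb;
        memset(&skb, 0, sizeof(skb));
        stamp(&skb, 7, 0);
        return ((struct rx_info *)&skb.cb[0])->id == 7 ? 0 : 1;
    }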
*/ + if ( (mcl - rx_mcl) == ARRAY_SIZE(rx_mcl) ) + break; + } + + if ( mcl == rx_mcl ) + return; + + mcl[-2].args[2] = UVMF_FLUSH_TLB; + (void)HYPERVISOR_multicall(rx_mcl, mcl - rx_mcl); + + mcl = rx_mcl; + while ( (skb = __skb_dequeue(&rxq)) != NULL ) + { + netif = ((rx_info_t *)&skb->cb[0])->netif; + size = skb->tail - skb->data; + id = ((rx_info_t *)&skb->cb[0])->id; + new_mfn = ((rx_info_t *)&skb->cb[0])->new_mach_pfn; + mdata = ((rx_info_t *)&skb->cb[0])->old_mach_ptr; + + /* Check the reassignment error code. */ + if ( unlikely(mcl[1].args[5] != 0) ) + { + DPRINTK("Failed MMU update transferring to DOM%u\n", + netif->domid); + (void)HYPERVISOR_update_va_mapping( + (unsigned long)skb->head >> PAGE_SHIFT, + (pte_t) { (mdata & PAGE_MASK) | __PAGE_KERNEL }, + UVMF_INVLPG); + dealloc_mfn(new_mfn); + status = NETIF_RSP_ERROR; + } + else + { + phys_to_machine_mapping[__pa(skb->data) >> PAGE_SHIFT] = new_mfn; + + atomic_set(&(skb_shinfo(skb)->dataref), 1); + skb_shinfo(skb)->nr_frags = 0; + skb_shinfo(skb)->frag_list = NULL; + + netif->stats.rx_bytes += size; + netif->stats.rx_packets++; + + status = NETIF_RSP_OKAY; + } + + evtchn = netif->evtchn; + if ( make_rx_response(netif, id, status, mdata, size) && + (rx_notify[evtchn] == 0) ) + { + rx_notify[evtchn] = 1; + notify_list[notify_nr++] = evtchn; + } + + dev_kfree_skb(skb); + + mcl += 2; + } + + while ( notify_nr != 0 ) + { + evtchn = notify_list[--notify_nr]; + rx_notify[evtchn] = 0; + notify_via_evtchn(evtchn); + } + + /* More work to do? */ + if ( !skb_queue_empty(&rx_queue) ) + tasklet_schedule(&net_rx_tasklet); +#if 0 + else + xen_network_done_notify(); +#endif +} + +struct net_device_stats *netif_be_get_stats(struct net_device *dev) +{ + netif_t *netif = dev->priv; + return &netif->stats; +} + +static int __on_net_schedule_list(netif_t *netif) +{ + return netif->list.next != NULL; +} + +static void remove_from_net_schedule_list(netif_t *netif) +{ + spin_lock_irq(&net_schedule_list_lock); + if ( likely(__on_net_schedule_list(netif)) ) + { + list_del(&netif->list); + netif->list.next = NULL; + netif_put(netif); + } + spin_unlock_irq(&net_schedule_list_lock); +} + +static void add_to_net_schedule_list_tail(netif_t *netif) +{ + if ( __on_net_schedule_list(netif) ) + return; + + spin_lock_irq(&net_schedule_list_lock); + if ( !__on_net_schedule_list(netif) && (netif->status == CONNECTED) ) + { + list_add_tail(&netif->list, &net_schedule_list); + netif_get(netif); + } + spin_unlock_irq(&net_schedule_list_lock); +} + +static inline void netif_schedule_work(netif_t *netif) +{ + if ( (netif->tx_req_cons != netif->tx->req_prod) && + ((netif->tx_req_cons-netif->tx_resp_prod) != NETIF_TX_RING_SIZE) ) + { + add_to_net_schedule_list_tail(netif); + maybe_schedule_tx_action(); + } +} + +void netif_deschedule(netif_t *netif) +{ + remove_from_net_schedule_list(netif); +} + +#if 0 +static void tx_credit_callback(unsigned long data) +{ + netif_t *netif = (netif_t *)data; + netif->remaining_credit = netif->credit_bytes; + netif_schedule_work(netif); +} +#endif + +static void net_tx_action(unsigned long unused) +{ + struct list_head *ent; + struct sk_buff *skb; + netif_t *netif; + netif_tx_request_t txreq; + u16 pending_idx; + NETIF_RING_IDX i; + struct page *page; + multicall_entry_t *mcl; + + if ( (i = dealloc_cons) == dealloc_prod ) + goto skip_dealloc; + + mcl = tx_mcl; + while ( i != dealloc_prod ) + { + pending_idx = dealloc_ring[MASK_PEND_IDX(i++)]; + mcl[0].op = __HYPERVISOR_update_va_mapping; + mcl[0].args[0] = MMAP_VADDR(pending_idx) >> 
PAGE_SHIFT; + mcl[0].args[1] = 0; + mcl[0].args[2] = 0; + mcl++; + } + + mcl[-1].args[2] = UVMF_FLUSH_TLB; + (void)HYPERVISOR_multicall(tx_mcl, mcl - tx_mcl); + + while ( dealloc_cons != dealloc_prod ) + { + pending_idx = dealloc_ring[MASK_PEND_IDX(dealloc_cons++)]; + + netif = pending_netif[pending_idx]; + + spin_lock(&netif->tx_lock); + make_tx_response(netif, pending_id[pending_idx], NETIF_RSP_OKAY); + spin_unlock(&netif->tx_lock); + + pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx; + + /* + * Scheduling checks must happen after the above response is posted. + * This avoids a possible race with a guest OS on another CPU. + */ + mb(); + if ( (netif->tx_req_cons != netif->tx->req_prod) && + ((netif->tx_req_cons-netif->tx_resp_prod) != NETIF_TX_RING_SIZE) ) + add_to_net_schedule_list_tail(netif); + + netif_put(netif); + } + + skip_dealloc: + mcl = tx_mcl; + while ( (NR_PENDING_REQS < MAX_PENDING_REQS) && + !list_empty(&net_schedule_list) ) + { + /* Get a netif from the list with work to do. */ + ent = net_schedule_list.next; + netif = list_entry(ent, netif_t, list); + netif_get(netif); + remove_from_net_schedule_list(netif); + + /* Work to do? */ + i = netif->tx_req_cons; + if ( (i == netif->tx->req_prod) || + ((i-netif->tx_resp_prod) == NETIF_TX_RING_SIZE) ) + { + netif_put(netif); + continue; + } + memcpy(&txreq, &netif->tx->ring[MASK_NETIF_TX_IDX(i)].req, + sizeof(txreq)); + netif->tx_req_cons++; + +#if 0 + /* Credit-based scheduling. */ + if ( tx.size > netif->remaining_credit ) + { + s_time_t now = NOW(), next_credit = + netif->credit_timeout.expires + MICROSECS(netif->credit_usec); + if ( next_credit <= now ) + { + netif->credit_timeout.expires = now; + netif->remaining_credit = netif->credit_bytes; + } + else + { + netif->remaining_credit = 0; + netif->credit_timeout.expires = next_credit; + netif->credit_timeout.data = (unsigned long)netif; + netif->credit_timeout.function = tx_credit_callback; + netif->credit_timeout.cpu = smp_processor_id(); + add_ac_timer(&netif->credit_timeout); + break; + } + } + netif->remaining_credit -= tx.size; +#endif + + netif_schedule_work(netif); + + if ( unlikely(txreq.size <= PKT_PROT_LEN) || + unlikely(txreq.size > ETH_FRAME_LEN) ) + { + DPRINTK("Bad packet size: %d\n", txreq.size); + make_tx_response(netif, txreq.id, NETIF_RSP_ERROR); + netif_put(netif); + continue; + } + + /* No crossing a page boundary as the payload mustn't fragment. */ + if ( unlikely(((txreq.addr & ~PAGE_MASK) + txreq.size) >= PAGE_SIZE) ) + { + DPRINTK("txreq.addr: %lx, size: %u, end: %lu\n", + txreq.addr, txreq.size, + (txreq.addr &~PAGE_MASK) + txreq.size); + make_tx_response(netif, txreq.id, NETIF_RSP_ERROR); + netif_put(netif); + continue; + } + + pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)]; + + if ( unlikely((skb = alloc_skb(PKT_PROT_LEN, GFP_ATOMIC)) == NULL) ) + { + DPRINTK("Can't allocate a skb in start_xmit.\n"); + make_tx_response(netif, txreq.id, NETIF_RSP_ERROR); + netif_put(netif); + break; + } + + mcl[0].op = __HYPERVISOR_update_va_mapping_otherdomain; + mcl[0].args[0] = MMAP_VADDR(pending_idx) >> PAGE_SHIFT; + mcl[0].args[1] = (txreq.addr & PAGE_MASK) | __PAGE_KERNEL; + mcl[0].args[2] = 0; + mcl[0].args[3] = netif->domid; + mcl++; + + ((tx_info_t *)&skb->cb[0])->idx = pending_idx; + ((tx_info_t *)&skb->cb[0])->netif = netif; + memcpy(&((tx_info_t *)&skb->cb[0])->req, &txreq, sizeof(txreq)); + __skb_queue_tail(&tx_queue, skb); + + pending_cons++; + + /* Filled the batch queue? 
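The loop being closed here shows the section's recurring gather/submit/inspect shape: fill an array of call descriptors while draining a ring, issue them in a single batched call, then make a second pass over the descriptors for the per-entry results. A generic standalone sketch of that shape, where batch_submit is a hypothetical stand-in for HYPERVISOR_multicall:

    #include <stdio.h>

    #define BATCH 8
    struct call { int arg; int result; };

    /* Hypothetical stand-in for a batched hypercall. */
    static void batch_submit(struct call *c, int n)
    {
        for (int i = 0; i < n; i++)
            c[i].result = (c[i].arg >= 0) ? 0 : -1;
    }

    int main(void)
    {
        struct call batch[BATCH];
        int n = 0;
        for (int work = -2; work < 3 && n < BATCH; work++)
            batch[n++].arg = work;          /* pass 1: gather            */
        batch_submit(batch, n);             /* one submission for all    */
        for (int i = 0; i < n; i++)         /* pass 2: per-entry results */
            if (batch[i].result != 0)
                printf("entry %d failed\n", i);
        return 0;
    }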
*/
+        if ( (mcl - tx_mcl) == ARRAY_SIZE(tx_mcl) )
+            break;
+    }
+
+    if ( mcl == tx_mcl )
+        return;
+
+    (void)HYPERVISOR_multicall(tx_mcl, mcl - tx_mcl);
+
+    mcl = tx_mcl;
+    while ( (skb = __skb_dequeue(&tx_queue)) != NULL )
+    {
+        pending_idx = ((tx_info_t *)&skb->cb[0])->idx;
+        netif       = ((tx_info_t *)&skb->cb[0])->netif;
+        memcpy(&txreq, &((tx_info_t *)&skb->cb[0])->req, sizeof(txreq));
+
+        /* Check the remap error code. */
+        if ( unlikely(mcl[0].args[5] != 0) )
+        {
+            DPRINTK("Bad page frame\n");
+            make_tx_response(netif, txreq.id, NETIF_RSP_ERROR);
+            netif_put(netif);
+            kfree_skb(skb);
+            mcl++;
+            pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
+            continue;
+        }
+
+        phys_to_machine_mapping[__pa(MMAP_VADDR(pending_idx)) >> PAGE_SHIFT] =
+            txreq.addr >> PAGE_SHIFT;
+
+        __skb_put(skb, PKT_PROT_LEN);
+        memcpy(skb->data,
+               (void *)(MMAP_VADDR(pending_idx)|(txreq.addr&~PAGE_MASK)),
+               PKT_PROT_LEN);
+
+        page = virt_to_page(MMAP_VADDR(pending_idx));
+
+        /* Append the packet payload as a fragment. */
+        skb_shinfo(skb)->frags[0].page        = page;
+        skb_shinfo(skb)->frags[0].size        = txreq.size - PKT_PROT_LEN;
+        skb_shinfo(skb)->frags[0].page_offset =
+            (txreq.addr + PKT_PROT_LEN) & ~PAGE_MASK;
+        skb_shinfo(skb)->nr_frags = 1;
+        skb->data_len  = txreq.size - PKT_PROT_LEN;
+        skb->len      += skb->data_len;
+
+        skb->dev      = netif->dev;
+        skb->protocol = eth_type_trans(skb, skb->dev);
+
+        /*
+         * Destructor information. We hideously abuse the 'mapping' pointer,
+         * which isn't otherwise used by us. The page deallocator is modified
+         * to interpret a non-NULL value as a destructor function to be called.
+         * This works okay because in all other cases the pointer must be NULL
+         * when the page is freed (normally Linux will explicitly bug out if
+         * it sees otherwise).
+         */
+        page->mapping = (struct address_space *)netif_page_release;
+        atomic_set(&page->count, 1);
+        pending_id[pending_idx]    = txreq.id;
+        pending_netif[pending_idx] = netif;
+
+        netif->stats.tx_bytes += txreq.size;
+        netif->stats.tx_packets++;
+
+        netif_rx(skb);
+        netif->dev->last_rx = jiffies;
+
+        mcl++;
+    }
+}
+
+static void netif_page_release(struct page *page)
+{
+    unsigned long flags;
+    u16 pending_idx = page - virt_to_page(mmap_vstart);
+
+    /* Stop the abuse. */
+    page->mapping = NULL;
+
+    spin_lock_irqsave(&dealloc_lock, flags);
+    dealloc_ring[MASK_PEND_IDX(dealloc_prod++)] = pending_idx;
+    spin_unlock_irqrestore(&dealloc_lock, flags);
+
+    tasklet_schedule(&net_tx_tasklet);
+}
+
+#if 0
+long flush_bufs_for_netif(netif_t *netif)
+{
+    NET_RING_IDX i;
+
+    /* Return any outstanding receive buffers to the guest OS. */
+    spin_lock(&netif->rx_lock);
+    for ( i = netif->rx_req_cons;
+          (i != netif->rx->req_prod) &&
+              ((i-netif->rx_resp_prod) != NETIF_RX_RING_SIZE);
+          i++ )
+    {
+        make_rx_response(netif,
+                         netif->rx->ring[MASK_NETIF_RX_IDX(i)].req.id,
+                         NETIF_RSP_DROPPED, 0, 0);
+    }
+    netif->rx_req_cons = i;
+    spin_unlock(&netif->rx_lock);
+
+    /*
+     * Flush pending transmit buffers. The guest may still have to wait for
+     * buffers that are queued at a physical NIC.
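The 'mapping' overload described in net_tx_action above amounts to a per-page destructor hook: a pointer field known to be NULL on every other free path is hijacked to carry a callback. A sketch of the same idea with an explicit hook field, all names hypothetical:

    #include <stdio.h>

    struct page_like {
        void (*release)(struct page_like *);  /* NULL = normal free path */
    };

    static void my_release(struct page_like *p)
    {
        p->release = 0;                       /* "stop the abuse"        */
        printf("deferred work queued for page %p\n", (void *)p);
    }

    static void free_page_like(struct page_like *p)
    {
        /* The allocator checks the hook before truly freeing. */
        if (p->release) { p->release(p); return; }
        /* ... normal free path ... */
    }

    int main(void)
    {
        struct page_like pg = { my_release };
        free_page_like(&pg);
        return 0;
    }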
+ */ + spin_lock(&netif->tx_lock); + for ( i = netif->tx_req_cons; + (i != netif->tx->req_prod) && + ((i-netif->tx_resp_prod) != NETIF_TX_RING_SIZE); + i++ ) + { + make_tx_response(netif, + netif->tx->ring[MASK_NETIF_TX_IDX(i)].req.id, + NETIF_RSP_DROPPED); + } + netif->tx_req_cons = i; + spin_unlock(&netif->tx_lock); + + return 0; +} +#endif + +void netif_be_int(int irq, void *dev_id, struct pt_regs *regs) +{ + netif_t *netif = dev_id; + if ( tx_work_exists(netif) ) + { + add_to_net_schedule_list_tail(netif); + maybe_schedule_tx_action(); + } +} + +static void make_tx_response(netif_t *netif, + u16 id, + s8 st) +{ + NET_RING_IDX i = netif->tx_resp_prod; + netif_tx_response_t *resp; + + resp = &netif->tx->ring[MASK_NETIF_TX_IDX(i)].resp; + resp->id = id; + resp->status = st; + wmb(); + netif->tx->resp_prod = netif->tx_resp_prod = ++i; + + mb(); /* Update producer before checking event threshold. */ + if ( i == netif->tx->event ) + notify_via_evtchn(netif->evtchn); +} + +static int make_rx_response(netif_t *netif, + u16 id, + s8 st, + memory_t addr, + u16 size) +{ + NET_RING_IDX i = netif->rx_resp_prod; + netif_rx_response_t *resp; + + resp = &netif->rx->ring[MASK_NETIF_RX_IDX(i)].resp; + resp->addr = addr; + resp->id = id; + resp->status = (s16)size; + if ( st < 0 ) + resp->status = (s16)st; + wmb(); + netif->rx->resp_prod = netif->rx_resp_prod = ++i; + + mb(); /* Update producer before checking event threshold. */ + return (i == netif->rx->event); +} + +static void netif_be_dbg(int irq, void *dev_id, struct pt_regs *regs) +{ + struct list_head *ent; + netif_t *netif; + int i = 0; + + printk(KERN_ALERT "netif_schedule_list:\n"); + spin_lock_irq(&net_schedule_list_lock); + + list_for_each ( ent, &net_schedule_list ) + { + netif = list_entry(ent, netif_t, list); + printk(KERN_ALERT " %d: private(rx_req_cons=%08x rx_resp_prod=%08x\n", + i, netif->rx_req_cons, netif->rx_resp_prod); + printk(KERN_ALERT " tx_req_cons=%08x tx_resp_prod=%08x)\n", + netif->tx_req_cons, netif->tx_resp_prod); + printk(KERN_ALERT " shared(rx_req_prod=%08x rx_resp_prod=%08x\n", + netif->rx->req_prod, netif->rx->resp_prod); + printk(KERN_ALERT " rx_event=%08x tx_req_prod=%08x\n", + netif->rx->event, netif->tx->req_prod); + printk(KERN_ALERT " tx_resp_prod=%08x, tx_event=%08x)\n", + netif->tx->resp_prod, netif->tx->event); + i++; + } + + spin_unlock_irq(&net_schedule_list_lock); + printk(KERN_ALERT " ** End of netif_schedule_list **\n"); +} + +static int __init init_module(void) +{ + int i; + + if ( !(start_info.flags & SIF_NET_BE_DOMAIN) && + !(start_info.flags & SIF_INITDOMAIN) ) + return 0; + + printk("Initialising Xen netif backend\n"); + + skb_queue_head_init(&rx_queue); + skb_queue_head_init(&tx_queue); + + netif_interface_init(); + + if ( (mmap_vstart = allocate_empty_lowmem_region(MAX_PENDING_REQS)) == 0 ) + BUG(); + + pending_cons = 0; + pending_prod = MAX_PENDING_REQS; + for ( i = 0; i < MAX_PENDING_REQS; i++ ) + pending_ring[i] = i; + + spin_lock_init(&net_schedule_list_lock); + INIT_LIST_HEAD(&net_schedule_list); + + netif_ctrlif_init(); + + (void)request_irq(bind_virq_to_irq(VIRQ_DEBUG), + netif_be_dbg, SA_SHIRQ, + "net-be-dbg", &netif_be_dbg); + + return 0; +} + +static void cleanup_module(void) +{ + BUG(); +} + +module_init(init_module); +module_exit(cleanup_module); diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/netif/frontend/Makefile b/linux-2.4.26-xen-sparse/arch/xen/drivers/netif/frontend/Makefile new file mode 100644 index 0000000000..032d02d7cc --- /dev/null +++ 
b/linux-2.4.26-xen-sparse/arch/xen/drivers/netif/frontend/Makefile @@ -0,0 +1,3 @@ +O_TARGET := drv.o +obj-y := main.o +include $(TOPDIR)/Rules.make diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/netif/frontend/main.c b/linux-2.4.26-xen-sparse/arch/xen/drivers/netif/frontend/main.c new file mode 100644 index 0000000000..4d4c579703 --- /dev/null +++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/netif/frontend/main.c @@ -0,0 +1,777 @@ +/****************************************************************************** + * arch/xen/drivers/netif/frontend/main.c + * + * Virtual network driver for XenoLinux. + * + * Copyright (c) 2002-2004, K A Fraser + */ + +#include <linux/config.h> +#include <linux/module.h> + +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/errno.h> + +#include <linux/netdevice.h> +#include <linux/inetdevice.h> +#include <linux/etherdevice.h> +#include <linux/skbuff.h> +#include <linux/init.h> + +#include <asm/io.h> +#include <net/sock.h> +#include <net/pkt_sched.h> + +#include <asm/evtchn.h> +#include <asm/ctrl_if.h> + +#include <asm/page.h> + +#include "../netif.h" + +#define RX_BUF_SIZE ((PAGE_SIZE/2)+1) /* Fool the slab allocator :-) */ + +static void network_tx_buf_gc(struct net_device *dev); +static void network_alloc_rx_buffers(struct net_device *dev); +static void cleanup_module(void); + +static unsigned long rx_pfn_array[NETIF_RX_RING_SIZE]; +static multicall_entry_t rx_mcl[NETIF_RX_RING_SIZE+1]; +static mmu_update_t rx_mmu[NETIF_RX_RING_SIZE]; + +static struct list_head dev_list; + +struct net_private +{ + struct list_head list; + struct net_device *dev; + + struct net_device_stats stats; + NETIF_RING_IDX rx_resp_cons, tx_resp_cons; + unsigned int tx_full; + + netif_tx_interface_t *tx; + netif_rx_interface_t *rx; + + spinlock_t tx_lock; + spinlock_t rx_lock; + + unsigned int handle; + unsigned int evtchn; + unsigned int irq; + +#define NETIF_STATE_CLOSED 0 +#define NETIF_STATE_DISCONNECTED 1 +#define NETIF_STATE_CONNECTED 2 +#define NETIF_STATE_ACTIVE 3 + unsigned int state; + + /* + * {tx,rx}_skbs store outstanding skbuffs. The first entry in each + * array is an index into a chain of free entries. + */ + struct sk_buff *tx_skbs[NETIF_TX_RING_SIZE+1]; + struct sk_buff *rx_skbs[NETIF_RX_RING_SIZE+1]; +}; + +/* Access macros for acquiring freeing slots in {tx,rx}_skbs[]. */ +#define ADD_ID_TO_FREELIST(_list, _id) \ + (_list)[(_id)] = (_list)[0]; \ + (_list)[0] = (void *)(unsigned long)(_id); +#define GET_ID_FROM_FREELIST(_list) \ + ({ unsigned long _id = (unsigned long)(_list)[0]; \ + (_list)[0] = (_list)[_id]; \ + (unsigned short)_id; }) + +static struct net_device *find_dev_by_handle(unsigned int handle) +{ + struct list_head *ent; + struct net_private *np; + list_for_each ( ent, &dev_list ) + { + np = list_entry(ent, struct net_private, list); + if ( np->handle == handle ) + return np->dev; + } + return NULL; +} + + +static int network_open(struct net_device *dev) +{ + struct net_private *np = dev->priv; + int i; + + if ( np->state != NETIF_STATE_CONNECTED ) + return -EINVAL; + + np->rx_resp_cons = np->tx_resp_cons = np->tx_full = 0; + memset(&np->stats, 0, sizeof(np->stats)); + spin_lock_init(&np->tx_lock); + spin_lock_init(&np->rx_lock); + + /* Initialise {tx,rx}_skbs to be a free chain containing every entry. 
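The free chain initialised here threads the freelist through the array itself: entry 0 holds the index of the first free slot and each free slot holds the index of the next, stored through the same pointer-sized cells that otherwise hold skb pointers. The same structure in standalone form, minus the casts:

    #include <stdio.h>

    #define N 8
    static unsigned long slot[N + 1];   /* slot[0] heads the free chain */

    static void put_id(unsigned long id) { slot[id] = slot[0]; slot[0] = id; }

    static unsigned long get_id(void)
    {
        unsigned long id = slot[0];
        slot[0] = slot[id];
        return id;
    }

    int main(void)
    {
        for (unsigned long i = 0; i <= N; i++)
            slot[i] = i + 1;                        /* every slot free   */
        unsigned long a = get_id(), b = get_id();
        printf("got %lu then %lu\n", a, b);         /* 1 then 2          */
        put_id(a);
        printf("after free, next is %lu\n", get_id());  /* 1 again       */
        return 0;
    }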
*/ + for ( i = 0; i <= NETIF_TX_RING_SIZE; i++ ) + np->tx_skbs[i] = (void *)(i+1); + for ( i = 0; i <= NETIF_RX_RING_SIZE; i++ ) + np->rx_skbs[i] = (void *)(i+1); + + wmb(); + np->state = NETIF_STATE_ACTIVE; + + network_alloc_rx_buffers(dev); + np->rx->event = np->rx_resp_cons + 1; + + netif_start_queue(dev); + + MOD_INC_USE_COUNT; + + return 0; +} + + +static void network_tx_buf_gc(struct net_device *dev) +{ + NETIF_RING_IDX i, prod; + unsigned short id; + struct net_private *np = dev->priv; + struct sk_buff *skb; + + do { + prod = np->tx->resp_prod; + + for ( i = np->tx_resp_cons; i != prod; i++ ) + { + id = np->tx->ring[MASK_NET_TX_IDX(i)].resp.id; + skb = np->tx_skbs[id]; + ADD_ID_TO_FREELIST(np->tx_skbs, id); + dev_kfree_skb_any(skb); + } + + np->tx_resp_cons = prod; + + /* + * Set a new event, then check for race with update of tx_cons. Note + * that it is essential to schedule a callback, no matter how few + * buffers are pending. Even if there is space in the transmit ring, + * higher layers may be blocked because too much data is outstanding: + * in such cases notification from Xen is likely to be the only kick + * that we'll get. + */ + np->tx->event = + prod + ((np->tx->req_prod - prod) >> 1) + 1; + mb(); + } + while ( prod != np->tx->resp_prod ); + + if ( np->tx_full && + ((np->tx->req_prod - prod) < NETIF_TX_RING_SIZE) ) + { + np->tx_full = 0; + if ( np->state == NETIF_STATE_ACTIVE ) + netif_wake_queue(dev); + } +} + + +static void network_alloc_rx_buffers(struct net_device *dev) +{ + unsigned short id; + struct net_private *np = dev->priv; + struct sk_buff *skb; + NETIF_RING_IDX i = np->rx->req_prod; + int nr_pfns = 0; + + /* Make sure the batch is large enough to be worthwhile (1/2 ring). */ + if ( unlikely((i - np->rx_resp_cons) > (NETIF_RX_RING_SIZE/2)) || + unlikely(np->state != NETIF_STATE_ACTIVE) ) + return; + + do { + skb = dev_alloc_skb(RX_BUF_SIZE); + if ( unlikely(skb == NULL) ) + break; + + skb->dev = dev; + + if ( unlikely(((unsigned long)skb->head & (PAGE_SIZE-1)) != 0) ) + panic("alloc_skb needs to provide us page-aligned buffers."); + + id = GET_ID_FROM_FREELIST(np->rx_skbs); + + np->rx_skbs[id] = skb; + + np->rx->ring[MASK_NET_RX_IDX(i)].req.id = id; + + rx_pfn_array[nr_pfns] = virt_to_machine(skb->head) >> PAGE_SHIFT; + + rx_mcl[nr_pfns].op = __HYPERVISOR_update_va_mapping; + rx_mcl[nr_pfns].args[0] = (unsigned long)skb->head >> PAGE_SHIFT; + rx_mcl[nr_pfns].args[1] = 0; + rx_mcl[nr_pfns].args[2] = 0; + + nr_pfns++; + } + while ( (++i - np->rx_resp_cons) != NETIF_RX_RING_SIZE ); + + /* + * We may have allocated buffers which have entries outstanding in the page + * update queue -- make sure we flush those first! + */ + flush_page_update_queue(); + + /* After all PTEs have been zapped we blow away stale TLB entries. */ + rx_mcl[nr_pfns-1].args[2] = UVMF_FLUSH_TLB; + + /* Give away a batch of pages. */ + rx_mcl[nr_pfns].op = __HYPERVISOR_dom_mem_op; + rx_mcl[nr_pfns].args[0] = MEMOP_decrease_reservation; + rx_mcl[nr_pfns].args[1] = (unsigned long)rx_pfn_array; + rx_mcl[nr_pfns].args[2] = (unsigned long)nr_pfns; + + /* Zap PTEs and give away pages in one big multicall. */ + (void)HYPERVISOR_multicall(rx_mcl, nr_pfns+1); + + /* Check return status of HYPERVISOR_dom_mem_op(). 
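Worth a worked example: network_tx_buf_gc above sets np->tx->event = prod + ((req_prod - prod) >> 1) + 1, i.e. it asks for the next notification at the halfway point of the currently outstanding requests, so a burst is reaped in a logarithmic number of wakeups rather than one per packet. Under assumed sample values:

    #include <stdio.h>

    int main(void)
    {
        unsigned int resp_prod = 100;   /* responses consumed so far */
        unsigned int req_prod  = 116;   /* requests posted so far    */
        /* 16 outstanding: ask to be woken after 8 more responses.   */
        unsigned int event = resp_prod + ((req_prod - resp_prod) >> 1) + 1;
        printf("event threshold = %u\n", event);   /* 109 */
        return 0;
    }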
*/ + if ( rx_mcl[nr_pfns].args[5] != nr_pfns ) + panic("Unable to reduce memory reservation\n"); + + np->rx->req_prod = i; +} + + +static int network_start_xmit(struct sk_buff *skb, struct net_device *dev) +{ + unsigned short id; + struct net_private *np = (struct net_private *)dev->priv; + netif_tx_request_t *tx; + NETIF_RING_IDX i; + + if ( unlikely(np->tx_full) ) + { + printk(KERN_ALERT "%s: full queue wasn't stopped!\n", dev->name); + netif_stop_queue(dev); + return -ENOBUFS; + } + + if ( unlikely((((unsigned long)skb->data & ~PAGE_MASK) + skb->len) >= + PAGE_SIZE) ) + { + struct sk_buff *new_skb = dev_alloc_skb(RX_BUF_SIZE); + if ( unlikely(new_skb == NULL) ) + return 1; + skb_put(new_skb, skb->len); + memcpy(new_skb->data, skb->data, skb->len); + dev_kfree_skb(skb); + skb = new_skb; + } + + spin_lock_irq(&np->tx_lock); + + /* if the backend isn't available then don't do anything! */ + if ( !netif_carrier_ok(dev) ) + { + spin_unlock_irq(&np->tx_lock); + return 1; + } + + i = np->tx->req_prod; + + id = GET_ID_FROM_FREELIST(np->tx_skbs); + np->tx_skbs[id] = skb; + + tx = &np->tx->ring[MASK_NET_TX_IDX(i)].req; + + tx->id = id; + tx->addr = virt_to_machine(skb->data); + tx->size = skb->len; + + wmb(); + np->tx->req_prod = i + 1; + + network_tx_buf_gc(dev); + + if ( (i - np->tx_resp_cons) == (NETIF_TX_RING_SIZE - 1) ) + { + np->tx_full = 1; + netif_stop_queue(dev); + } + + spin_unlock_irq(&np->tx_lock); + + np->stats.tx_bytes += skb->len; + np->stats.tx_packets++; + + /* Only notify Xen if there are no outstanding responses. */ + mb(); + if ( np->tx->resp_prod == i ) + notify_via_evtchn(np->evtchn); + + return 0; +} + + +static void netif_int(int irq, void *dev_id, struct pt_regs *ptregs) +{ + struct net_device *dev = dev_id; + struct net_private *np = dev->priv; + unsigned long flags; + + spin_lock_irqsave(&np->tx_lock, flags); + + if( !netif_carrier_ok(dev) ) + { + spin_unlock_irqrestore(&np->tx_lock, flags); + return; + } + + network_tx_buf_gc(dev); + spin_unlock_irqrestore(&np->tx_lock, flags); + + if ( np->rx_resp_cons != np->rx->resp_prod ) + netif_rx_schedule(dev); +} + + +static int netif_poll(struct net_device *dev, int *pbudget) +{ + struct net_private *np = dev->priv; + struct sk_buff *skb; + netif_rx_response_t *rx; + NETIF_RING_IDX i; + mmu_update_t *mmu = rx_mmu; + multicall_entry_t *mcl = rx_mcl; + int work_done, budget, more_to_do = 1; + struct sk_buff_head rxq; + unsigned long flags; + + spin_lock(&np->rx_lock); + + /* if the device is undergoing recovery then don't do anything */ + if ( !netif_carrier_ok(dev) ) + { + spin_unlock(&np->rx_lock); + return 0; + } + + skb_queue_head_init(&rxq); + + if ( (budget = *pbudget) > dev->quota ) + budget = dev->quota; + + for ( i = np->rx_resp_cons, work_done = 0; + (i != np->rx->resp_prod) && (work_done < budget); + i++, work_done++ ) + { + rx = &np->rx->ring[MASK_NET_RX_IDX(i)].resp; + + skb = np->rx_skbs[rx->id]; + ADD_ID_TO_FREELIST(np->rx_skbs, rx->id); + + if ( unlikely(rx->status <= 0) ) + { + /* Gate this error. We get a (valid) slew of them on suspend. */ + if ( np->state == NETIF_STATE_ACTIVE ) + printk(KERN_ALERT "bad buffer on RX ring!(%d)\n", rx->status); + dev_kfree_skb(skb); + continue; + } + + skb->data = skb->tail = skb->head + (rx->addr & ~PAGE_MASK); + skb_put(skb, rx->status); + + np->stats.rx_packets++; + np->stats.rx_bytes += rx->status; + + /* Remap the page. 
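The remap that follows is the page-flip bookkeeping in miniature: the guest patches its own phys-to-machine array directly, while the machine-to-phys direction must be handed to the hypervisor as an MMU_MACHPHYS_UPDATE request. A toy model of keeping the two directions consistent (array sizes and names illustrative, not the real tables):

    #include <stdio.h>

    #define NPAGES 16
    static unsigned long p2m[NPAGES];   /* guest pseudo-phys -> machine */
    static unsigned long m2p[NPAGES];   /* machine -> guest pseudo-phys */

    /* Flip machine frame 'mfn' in under pseudo-physical frame 'pfn'. */
    static void flip_in(unsigned long pfn, unsigned long mfn)
    {
        p2m[pfn] = mfn;   /* updated directly by the guest             */
        m2p[mfn] = pfn;   /* in Xen, the MMU_MACHPHYS_UPDATE request   */
    }

    int main(void)
    {
        flip_in(3, 9);
        printf("pfn 3 -> mfn %lu, mfn 9 -> pfn %lu\n", p2m[3], m2p[9]);
        return 0;
    }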
*/ + mmu->ptr = (rx->addr & PAGE_MASK) | MMU_MACHPHYS_UPDATE; + mmu->val = __pa(skb->head) >> PAGE_SHIFT; + mmu++; + mcl->op = __HYPERVISOR_update_va_mapping; + mcl->args[0] = (unsigned long)skb->head >> PAGE_SHIFT; + mcl->args[1] = (rx->addr & PAGE_MASK) | __PAGE_KERNEL; + mcl->args[2] = 0; + mcl++; + + phys_to_machine_mapping[__pa(skb->head) >> PAGE_SHIFT] = + rx->addr >> PAGE_SHIFT; + + __skb_queue_tail(&rxq, skb); + } + + /* Do all the remapping work, and M->P updates, in one big hypercall. */ + if ( likely((mcl - rx_mcl) != 0) ) + { + mcl->op = __HYPERVISOR_mmu_update; + mcl->args[0] = (unsigned long)rx_mmu; + mcl->args[1] = mmu - rx_mmu; + mcl->args[2] = 0; + mcl++; + (void)HYPERVISOR_multicall(rx_mcl, mcl - rx_mcl); + } + + while ( (skb = __skb_dequeue(&rxq)) != NULL ) + { + /* Set the shared-info area, which is hidden behind the real data. */ + atomic_set(&(skb_shinfo(skb)->dataref), 1); + skb_shinfo(skb)->nr_frags = 0; + skb_shinfo(skb)->frag_list = NULL; + + /* Ethernet-specific work. Delayed to here as it peeks the header. */ + skb->protocol = eth_type_trans(skb, dev); + + /* Pass it up. */ + netif_rx(skb); + dev->last_rx = jiffies; + } + + np->rx_resp_cons = i; + + network_alloc_rx_buffers(dev); + + *pbudget -= work_done; + dev->quota -= work_done; + + if ( work_done < budget ) + { + local_irq_save(flags); + + np->rx->event = i + 1; + + /* Deal with hypervisor racing our resetting of rx_event. */ + mb(); + if ( np->rx->resp_prod == i ) + { + __netif_rx_complete(dev); + more_to_do = 0; + } + + local_irq_restore(flags); + } + + spin_unlock(&np->rx_lock); + + return more_to_do; +} + + +static int network_close(struct net_device *dev) +{ + struct net_private *np = dev->priv; + + netif_stop_queue(np->dev); + + np->state = NETIF_STATE_CONNECTED; + + /* XXX We need to properly disconnect via the domain controller. */ + while ( /*(np->rx_resp_cons != np->rx->req_prod) ||*/ + (np->tx_resp_cons != np->tx->req_prod) ) + { + barrier(); + current->state = TASK_INTERRUPTIBLE; + schedule_timeout(1); + } + + MOD_DEC_USE_COUNT; + + return 0; +} + + +static struct net_device_stats *network_get_stats(struct net_device *dev) +{ + struct net_private *np = (struct net_private *)dev->priv; + return &np->stats; +} + + +static void netif_status_change(netif_fe_interface_status_changed_t *status) +{ + ctrl_msg_t cmsg; + netif_fe_interface_connect_t up; + struct net_device *dev; + struct net_private *np; + + if ( status->handle != 0 ) + { + printk(KERN_WARNING "Status change on unsupported netif %d\n", + status->handle); + return; + } + + dev = find_dev_by_handle(0); + np = dev->priv; + + switch ( status->status ) + { + case NETIF_INTERFACE_STATUS_DESTROYED: + printk(KERN_WARNING "Unexpected netif-DESTROYED message in state %d\n", + np->state); + break; + + case NETIF_INTERFACE_STATUS_DISCONNECTED: + if ( np->state != NETIF_STATE_CLOSED ) + { + printk(KERN_WARNING "Unexpected netif-DISCONNECTED message" + " in state %d\n", np->state); + printk(KERN_INFO "Attempting to reconnect network interface\n"); + + /* Begin interface recovery. + * + * NB. Whilst we're recovering, we turn the carrier state off. We + * take measures to ensure that this device isn't used for + * anything. We also stop the queue for this device. Various + * different approaches (e.g. continuing to buffer packets) have + * been tested but don't appear to improve the overall impact on + * TCP connections. 
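One more pattern from netif_poll just above deserves a note: before going idle it writes rx->event, issues a barrier, then re-checks resp_prod, which is the standard defence against a producer racing the event reset and the wakeup being lost. A sketch of the shape, with __sync_synchronize() standing in for mb():

    #include <stdio.h>

    static volatile unsigned int resp_prod, event;

    /* Returns 1 if it is safe to go idle having consumed up to 'seen'. */
    static int try_to_idle(unsigned int seen)
    {
        event = seen + 1;          /* ask for a notification past 'seen' */
        __sync_synchronize();      /* order the write before the re-read */
        return resp_prod == seen;  /* producer raced us? then keep going */
    }

    int main(void)
    {
        resp_prod = 5;
        printf("idle ok: %d\n", try_to_idle(5));   /* 1: nothing new     */
        resp_prod = 6;
        printf("idle ok: %d\n", try_to_idle(5));   /* 0: must poll again */
        return 0;
    }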
+ * + * TODO: (MAW) Change the Xend<->Guest protocol so that a recovery + * is initiated by a special "RESET" message - disconnect could + * just mean we're not allowed to use this interface any more. + */ + + /* Stop old i/f to prevent errors whilst we rebuild the state. */ + spin_lock_irq(&np->tx_lock); + spin_lock(&np->rx_lock); + netif_stop_queue(dev); + netif_carrier_off(dev); + np->state = NETIF_STATE_DISCONNECTED; + spin_unlock(&np->rx_lock); + spin_unlock_irq(&np->tx_lock); + + /* Free resources. */ + free_irq(np->irq, dev); + unbind_evtchn_from_irq(np->evtchn); + free_page((unsigned long)np->tx); + free_page((unsigned long)np->rx); + } + + /* Move from CLOSED to DISCONNECTED state. */ + np->tx = (netif_tx_interface_t *)__get_free_page(GFP_KERNEL); + np->rx = (netif_rx_interface_t *)__get_free_page(GFP_KERNEL); + memset(np->tx, 0, PAGE_SIZE); + memset(np->rx, 0, PAGE_SIZE); + np->state = NETIF_STATE_DISCONNECTED; + + /* Construct an interface-CONNECT message for the domain controller. */ + cmsg.type = CMSG_NETIF_FE; + cmsg.subtype = CMSG_NETIF_FE_INTERFACE_CONNECT; + cmsg.length = sizeof(netif_fe_interface_connect_t); + up.handle = 0; + up.tx_shmem_frame = virt_to_machine(np->tx) >> PAGE_SHIFT; + up.rx_shmem_frame = virt_to_machine(np->rx) >> PAGE_SHIFT; + memcpy(cmsg.msg, &up, sizeof(up)); + + /* Tell the controller to bring up the interface. */ + ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE); + break; + + case NETIF_INTERFACE_STATUS_CONNECTED: + if ( np->state == NETIF_STATE_CLOSED ) + { + printk(KERN_WARNING "Unexpected netif-CONNECTED message" + " in state %d\n", np->state); + break; + } + + memcpy(dev->dev_addr, status->mac, ETH_ALEN); + + if(netif_carrier_ok(dev)) + np->state = NETIF_STATE_CONNECTED; + else + { + int i, requeue_idx; + netif_tx_request_t *tx; + + spin_lock_irq(&np->rx_lock); + spin_lock(&np->tx_lock); + + /* Recovery procedure: */ + + /* Step 1: Reinitialise variables. */ + np->rx_resp_cons = np->tx_resp_cons = np->tx_full = 0; + np->rx->event = 1; + + /* Step 2: Rebuild the RX and TX ring contents. + * NB. We could just free the queued TX packets now but we hope + * that sending them out might do some good. We have to rebuild + * the RX ring because some of our pages are currently flipped out + * so we can't just free the RX skbs. + * NB2. Freelist index entries are always going to be less than + * __PAGE_OFFSET, whereas pointers to skbs will always be equal or + * greater than __PAGE_OFFSET: we use this property to distinguish + * them. + */ + + /* Rebuild the TX buffer freelist and the TX ring itself. + * NB. This reorders packets. We could keep more private state + * to avoid this but maybe it doesn't matter so much given the + * interface has been down. + */ + for ( requeue_idx = 0, i = 1; i <= NETIF_TX_RING_SIZE; i++ ) + { + if ( (unsigned long)np->tx_skbs[i] >= __PAGE_OFFSET ) + { + struct sk_buff *skb = np->tx_skbs[i]; + + tx = &np->tx->ring[requeue_idx++].req; + + tx->id = i; + tx->addr = virt_to_machine(skb->data); + tx->size = skb->len; + + np->stats.tx_bytes += skb->len; + np->stats.tx_packets++; + } + } + wmb(); + np->tx->req_prod = requeue_idx; + + /* Rebuild the RX buffer freelist and the RX ring itself. */ + for ( requeue_idx = 0, i = 1; i <= NETIF_RX_RING_SIZE; i++ ) + if ( (unsigned long)np->rx_skbs[i] >= __PAGE_OFFSET ) + np->rx->ring[requeue_idx++].req.id = i; + wmb(); + np->rx->req_prod = requeue_idx; + + /* Step 3: All public and private state should now be sane. 
Get + * ready to start sending and receiving packets and give the driver + * domain a kick because we've probably just requeued some + * packets. + */ + netif_carrier_on(dev); + netif_start_queue(dev); + np->state = NETIF_STATE_ACTIVE; + + notify_via_evtchn(status->evtchn); + + network_tx_buf_gc(dev); + + printk(KERN_INFO "Recovery completed\n"); + + spin_unlock(&np->tx_lock); + spin_unlock_irq(&np->rx_lock); + } + + np->evtchn = status->evtchn; + np->irq = bind_evtchn_to_irq(np->evtchn); + (void)request_irq(np->irq, netif_int, SA_SAMPLE_RANDOM, + dev->name, dev); + break; + + default: + printk(KERN_WARNING "Status change to unknown value %d\n", + status->status); + break; + } +} + + +static void netif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id) +{ + switch ( msg->subtype ) + { + case CMSG_NETIF_FE_INTERFACE_STATUS_CHANGED: + if ( msg->length != sizeof(netif_fe_interface_status_changed_t) ) + goto parse_error; + netif_status_change((netif_fe_interface_status_changed_t *) + &msg->msg[0]); + break; + default: + goto parse_error; + } + + ctrl_if_send_response(msg); + return; + + parse_error: + msg->length = 0; + ctrl_if_send_response(msg); +} + + +static int __init init_module(void) +{ + ctrl_msg_t cmsg; + netif_fe_driver_status_changed_t st; + int err; + struct net_device *dev; + struct net_private *np; + + if ( start_info.flags & SIF_INITDOMAIN + || start_info.flags & SIF_NET_BE_DOMAIN ) + return 0; + + printk("Initialising Xen virtual ethernet frontend driver"); + + INIT_LIST_HEAD(&dev_list); + + if ( (dev = alloc_etherdev(sizeof(struct net_private))) == NULL ) + { + err = -ENOMEM; + goto fail; + } + + np = dev->priv; + np->state = NETIF_STATE_CLOSED; + np->handle = 0; + + dev->open = network_open; + dev->hard_start_xmit = network_start_xmit; + dev->stop = network_close; + dev->get_stats = network_get_stats; + dev->poll = netif_poll; + dev->weight = 64; + + if ( (err = register_netdev(dev)) != 0 ) + { + kfree(dev); + goto fail; + } + + np->dev = dev; + list_add(&np->list, &dev_list); + + (void)ctrl_if_register_receiver(CMSG_NETIF_FE, netif_ctrlif_rx, + CALLBACK_IN_BLOCKING_CONTEXT); + + /* Send a driver-UP notification to the domain controller. */ + cmsg.type = CMSG_NETIF_FE; + cmsg.subtype = CMSG_NETIF_FE_DRIVER_STATUS_CHANGED; + cmsg.length = sizeof(netif_fe_driver_status_changed_t); + st.status = NETIF_DRIVER_STATUS_UP; + memcpy(cmsg.msg, &st, sizeof(st)); + + ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE); + + /* + * We should read 'nr_interfaces' from response message and wait + * for notifications before proceeding. For now we assume that we + * will be notified of exactly one interface. + */ + while ( np->state != NETIF_STATE_CONNECTED ) + { + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(1); + } + + return 0; + + fail: + cleanup_module(); + return err; +} + + +static void cleanup_module(void) +{ + /* XXX FIXME */ + BUG(); +} + + +module_init(init_module); +module_exit(cleanup_module); diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/netif/netif.h b/linux-2.4.26-xen-sparse/arch/xen/drivers/netif/netif.h new file mode 100644 index 0000000000..098b292612 --- /dev/null +++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/netif/netif.h @@ -0,0 +1,88 @@ +/****************************************************************************** + * netif.h + * + * Unified network-device I/O interface for Xen guest OSes. 
+ *
+ * Copyright (c) 2003-2004, Keir Fraser
+ */
+
+#ifndef __SHARED_NETIF_H__
+#define __SHARED_NETIF_H__
+
+typedef struct {
+    memory_t addr;   /* 0: Machine address of packet.  */
+    MEMORY_PADDING;
+    u16      id;     /* 8: Echoed in response message. */
+    u16      size;   /* 10: Packet size in bytes.      */
+} PACKED netif_tx_request_t; /* 12 bytes */
+
+typedef struct {
+    u16      id;     /* 0 */
+    s8       status; /* 2 */
+    u8       __pad;  /* 3 */
+} PACKED netif_tx_response_t; /* 4 bytes */
+
+typedef struct {
+    u16      id;     /* 0: Echoed in response message. */
+} PACKED netif_rx_request_t; /* 2 bytes */
+
+typedef struct {
+    memory_t addr;   /* 0: Machine address of packet.               */
+    MEMORY_PADDING;
+    u16      id;     /* 8:                                          */
+    s16      status; /* 10: -ve: NETIF_RSP_* ; +ve: Rx'ed pkt size. */
+} PACKED netif_rx_response_t; /* 12 bytes */
+
+/*
+ * We use a special capitalised type name because it is _essential_ that all
+ * arithmetic on indexes is done on an integer type of the correct size.
+ */
+typedef u32 NETIF_RING_IDX;
+
+/*
+ * Ring indexes are 'free running'. That is, they are not stored modulo the
+ * size of the ring buffer. The following macros convert a free-running counter
+ * into a value that can directly index a ring-buffer array.
+ */
+#define MASK_NETIF_RX_IDX(_i) ((_i)&(NETIF_RX_RING_SIZE-1))
+#define MASK_NETIF_TX_IDX(_i) ((_i)&(NETIF_TX_RING_SIZE-1))
+
+#define NETIF_TX_RING_SIZE 256
+#define NETIF_RX_RING_SIZE 256
+
+/* This structure must fit in a memory page. */
+typedef struct {
+    /*
+     * Frontend places packets into ring at tx_req_prod.
+     * Frontend receives event when tx_resp_prod passes tx_event.
+     */
+    NETIF_RING_IDX req_prod;  /* 0 */
+    NETIF_RING_IDX resp_prod; /* 4 */
+    NETIF_RING_IDX event;     /* 8 */
+    union {                   /* 12 */
+        netif_tx_request_t  req;
+        netif_tx_response_t resp;
+    } PACKED ring[NETIF_TX_RING_SIZE];
+} PACKED netif_tx_interface_t;
+
+/* This structure must fit in a memory page. */
+typedef struct {
+    /*
+     * Frontend places empty buffers into ring at rx_req_prod.
+     * Frontend receives event when rx_resp_prod passes rx_event.
+     */
+    NETIF_RING_IDX req_prod;  /* 0 */
+    NETIF_RING_IDX resp_prod; /* 4 */
+    NETIF_RING_IDX event;     /* 8 */
+    union {                   /* 12 */
+        netif_rx_request_t  req;
+        netif_rx_response_t resp;
+    } PACKED ring[NETIF_RX_RING_SIZE];
+} PACKED netif_rx_interface_t;
+
+/* Descriptor status values */
+#define NETIF_RSP_DROPPED  -2
+#define NETIF_RSP_ERROR    -1
+#define NETIF_RSP_OKAY      0
+
+#endif
diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/network/Makefile b/linux-2.4.26-xen-sparse/arch/xen/drivers/network/Makefile
new file mode 100644
index 0000000000..2e4c1f4825
--- /dev/null
+++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/network/Makefile
@@ -0,0 +1,3 @@
+O_TARGET := drv.o
+obj-y := network.o
+include $(TOPDIR)/Rules.make
diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/network/network.c b/linux-2.4.26-xen-sparse/arch/xen/drivers/network/network.c
new file mode 100644
index 0000000000..e6c340685a
--- /dev/null
+++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/network/network.c
@@ -0,0 +1,656 @@
+/******************************************************************************
+ * network.c
+ *
+ * Virtual network driver for XenoLinux.
+ * + * Copyright (c) 2002-2003, K A Fraser + */ + +#include <linux/config.h> +#include <linux/module.h> + +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/errno.h> + +#include <linux/netdevice.h> +#include <linux/inetdevice.h> +#include <linux/etherdevice.h> +#include <linux/skbuff.h> +#include <linux/init.h> + +#include <asm/io.h> +#include <net/sock.h> +#include <net/pkt_sched.h> + +#define RX_BUF_SIZE ((PAGE_SIZE/2)+1) /* Fool the slab allocator :-) */ + +static void network_interrupt(int irq, void *dev_id, struct pt_regs *ptregs); +static void network_tx_buf_gc(struct net_device *dev); +static void network_alloc_rx_buffers(struct net_device *dev); +static void cleanup_module(void); + +/* Dynamically-mapped IRQs. */ +static int network_irq, debug_irq; + +static struct list_head dev_list; + +struct net_private +{ + struct list_head list; + struct net_device *dev; + + struct net_device_stats stats; + NET_RING_IDX rx_resp_cons, tx_resp_cons; + unsigned int net_ring_fixmap_idx, tx_full; + net_ring_t *net_ring; + net_idx_t *net_idx; + spinlock_t tx_lock; + unsigned int idx; /* Domain-specific index of this VIF. */ + + unsigned int rx_bufs_to_notify; + +#define STATE_ACTIVE 0 +#define STATE_SUSPENDED 1 +#define STATE_CLOSED 2 + unsigned int state; + + /* + * {tx,rx}_skbs store outstanding skbuffs. The first entry in each + * array is an index into a chain of free entries. + */ + struct sk_buff *tx_skbs[XENNET_TX_RING_SIZE+1]; + struct sk_buff *rx_skbs[XENNET_RX_RING_SIZE+1]; +}; + +/* Access macros for acquiring freeing slots in {tx,rx}_skbs[]. */ +#define ADD_ID_TO_FREELIST(_list, _id) \ + (_list)[(_id)] = (_list)[0]; \ + (_list)[0] = (void *)(unsigned long)(_id); +#define GET_ID_FROM_FREELIST(_list) \ + ({ unsigned long _id = (unsigned long)(_list)[0]; \ + (_list)[0] = (_list)[_id]; \ + (unsigned short)_id; }) + + +static void _dbg_network_int(struct net_device *dev) +{ + struct net_private *np = dev->priv; + + if ( np->state == STATE_CLOSED ) + return; + + printk(KERN_ALERT "net: tx_full=%d, tx_resp_cons=0x%08x," + " tx_req_prod=0x%08x\nnet: tx_resp_prod=0x%08x," + " tx_event=0x%08x, state=%d\n", + np->tx_full, np->tx_resp_cons, + np->net_idx->tx_req_prod, np->net_idx->tx_resp_prod, + np->net_idx->tx_event, + test_bit(__LINK_STATE_XOFF, &dev->state)); + printk(KERN_ALERT "net: rx_resp_cons=0x%08x," + " rx_req_prod=0x%08x\nnet: rx_resp_prod=0x%08x, rx_event=0x%08x\n", + np->rx_resp_cons, np->net_idx->rx_req_prod, + np->net_idx->rx_resp_prod, np->net_idx->rx_event); +} + + +static void dbg_network_int(int irq, void *unused, struct pt_regs *ptregs) +{ + struct list_head *ent; + struct net_private *np; + list_for_each ( ent, &dev_list ) + { + np = list_entry(ent, struct net_private, list); + _dbg_network_int(np->dev); + } +} + + +static int network_open(struct net_device *dev) +{ + struct net_private *np = dev->priv; + netop_t netop; + int i, ret; + + netop.cmd = NETOP_RESET_RINGS; + netop.vif = np->idx; + if ( (ret = HYPERVISOR_net_io_op(&netop)) != 0 ) + { + printk(KERN_ALERT "Possible net trouble: couldn't reset ring idxs\n"); + return ret; + } + + netop.cmd = NETOP_GET_VIF_INFO; + netop.vif = np->idx; + if ( (ret = HYPERVISOR_net_io_op(&netop)) != 0 ) + { + printk(KERN_ALERT "Couldn't get info for vif %d\n", np->idx); + return ret; + } + + memcpy(dev->dev_addr, netop.u.get_vif_info.vmac, ETH_ALEN); + + set_fixmap(FIX_NETRING0_BASE + np->net_ring_fixmap_idx, + netop.u.get_vif_info.ring_mfn << PAGE_SHIFT); + 
np->net_ring = (net_ring_t *)fix_to_virt( + FIX_NETRING0_BASE + np->net_ring_fixmap_idx); + np->net_idx = &HYPERVISOR_shared_info->net_idx[np->idx]; + + np->rx_bufs_to_notify = 0; + np->rx_resp_cons = np->tx_resp_cons = np->tx_full = 0; + memset(&np->stats, 0, sizeof(np->stats)); + spin_lock_init(&np->tx_lock); + memset(np->net_ring, 0, sizeof(*np->net_ring)); + memset(np->net_idx, 0, sizeof(*np->net_idx)); + + /* Initialise {tx,rx}_skbs to be a free chain containing every entry. */ + for ( i = 0; i <= XENNET_TX_RING_SIZE; i++ ) + np->tx_skbs[i] = (void *)(i+1); + for ( i = 0; i <= XENNET_RX_RING_SIZE; i++ ) + np->rx_skbs[i] = (void *)(i+1); + + wmb(); + np->state = STATE_ACTIVE; + + network_alloc_rx_buffers(dev); + + netif_start_queue(dev); + + MOD_INC_USE_COUNT; + + return 0; +} + + +static void network_tx_buf_gc(struct net_device *dev) +{ + NET_RING_IDX i, prod; + unsigned short id; + struct net_private *np = dev->priv; + struct sk_buff *skb; + tx_entry_t *tx_ring = np->net_ring->tx_ring; + + do { + prod = np->net_idx->tx_resp_prod; + + for ( i = np->tx_resp_cons; i != prod; i++ ) + { + id = tx_ring[MASK_NET_TX_IDX(i)].resp.id; + skb = np->tx_skbs[id]; + ADD_ID_TO_FREELIST(np->tx_skbs, id); + dev_kfree_skb_any(skb); + } + + np->tx_resp_cons = prod; + + /* + * Set a new event, then check for race with update of tx_cons. Note + * that it is essential to schedule a callback, no matter how few + * buffers are pending. Even if there is space in the transmit ring, + * higher layers may be blocked because too much data is outstanding: + * in such cases notification from Xen is likely to be the only kick + * that we'll get. + */ + np->net_idx->tx_event = + prod + ((np->net_idx->tx_req_prod - prod) >> 1) + 1; + mb(); + } + while ( prod != np->net_idx->tx_resp_prod ); + + if ( np->tx_full && + ((np->net_idx->tx_req_prod - prod) < XENNET_TX_RING_SIZE) ) + { + np->tx_full = 0; + if ( np->state == STATE_ACTIVE ) + netif_wake_queue(dev); + } +} + + +static inline pte_t *get_ppte(void *addr) +{ + pgd_t *pgd; pmd_t *pmd; pte_t *pte; + pgd = pgd_offset_k( (unsigned long)addr); + pmd = pmd_offset(pgd, (unsigned long)addr); + pte = pte_offset(pmd, (unsigned long)addr); + return pte; +} + + +static void network_alloc_rx_buffers(struct net_device *dev) +{ + unsigned short id; + struct net_private *np = dev->priv; + struct sk_buff *skb; + netop_t netop; + NET_RING_IDX i = np->net_idx->rx_req_prod; + + if ( unlikely((i - np->rx_resp_cons) == XENNET_RX_RING_SIZE) || + unlikely(np->state != STATE_ACTIVE) ) + return; + + do { + skb = dev_alloc_skb(RX_BUF_SIZE); + if ( unlikely(skb == NULL) ) + break; + + skb->dev = dev; + + if ( unlikely(((unsigned long)skb->head & (PAGE_SIZE-1)) != 0) ) + panic("alloc_skb needs to provide us page-aligned buffers."); + + id = GET_ID_FROM_FREELIST(np->rx_skbs); + np->rx_skbs[id] = skb; + + np->net_ring->rx_ring[MASK_NET_RX_IDX(i)].req.id = id; + np->net_ring->rx_ring[MASK_NET_RX_IDX(i)].req.addr = + virt_to_machine(get_ppte(skb->head)); + + /* Shadow optimisation: disown this page from p->m map */ + phys_to_machine_mapping[virt_to_phys(skb->head)>>PAGE_SHIFT] = 0x80000004; + np->rx_bufs_to_notify++; + } + while ( (++i - np->rx_resp_cons) != XENNET_RX_RING_SIZE ); + + /* + * We may have allocated buffers which have entries outstanding in the page + * update queue -- make sure we flush those first! + */ + flush_page_update_queue(); + + np->net_idx->rx_req_prod = i; + np->net_idx->rx_event = np->rx_resp_cons + 1; + + /* Batch Xen notifications. 
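The quarter-ring threshold below amortises one hypercall over many posted buffers. The trade-off in miniature, where notify() is a hypothetical stand-in for the NETOP_PUSH_BUFFERS hypercall:

    #include <stdio.h>

    #define RING 32
    static int pending, notifications;

    static void notify(void) { notifications++; pending = 0; }  /* stand-in hypercall */

    static void post_buffer(void)
    {
        if (++pending > RING / 4)   /* batch: a handful of notifications per ring */
            notify();
    }

    int main(void)
    {
        for (int i = 0; i < RING; i++)
            post_buffer();
        printf("%d buffers, %d notifications\n", RING, notifications);
        return 0;
    }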
*/
+    if ( np->rx_bufs_to_notify > (XENNET_RX_RING_SIZE/4) )
+    {
+        netop.cmd = NETOP_PUSH_BUFFERS;
+        netop.vif = np->idx;
+        (void)HYPERVISOR_net_io_op(&netop);
+        np->rx_bufs_to_notify = 0;
+    }
+}
+
+
+static int network_start_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+    unsigned short id;
+    struct net_private *np = (struct net_private *)dev->priv;
+    tx_req_entry_t *tx;
+    netop_t netop;
+    NET_RING_IDX i;
+
+    if ( unlikely(np->tx_full) )
+    {
+        printk(KERN_ALERT "%s: full queue wasn't stopped!\n", dev->name);
+        netif_stop_queue(dev);
+        return -ENOBUFS;
+    }
+
+    if ( unlikely((((unsigned long)skb->data & ~PAGE_MASK) + skb->len) >=
+                  PAGE_SIZE) )
+    {
+        struct sk_buff *new_skb = dev_alloc_skb(RX_BUF_SIZE);
+        if ( unlikely(new_skb == NULL) )
+            return 1;
+        skb_put(new_skb, skb->len);
+        memcpy(new_skb->data, skb->data, skb->len);
+        dev_kfree_skb(skb);
+        skb = new_skb;
+    }
+
+    spin_lock_irq(&np->tx_lock);
+
+    i = np->net_idx->tx_req_prod;
+
+    id = GET_ID_FROM_FREELIST(np->tx_skbs);
+    np->tx_skbs[id] = skb;
+
+    tx = &np->net_ring->tx_ring[MASK_NET_TX_IDX(i)].req;
+
+    tx->id   = id;
+    tx->addr = phys_to_machine(virt_to_phys(skb->data));
+    tx->size = skb->len;
+
+    wmb();
+    np->net_idx->tx_req_prod = i + 1;
+
+    network_tx_buf_gc(dev);
+
+    if ( (i - np->tx_resp_cons) == (XENNET_TX_RING_SIZE - 1) )
+    {
+        np->tx_full = 1;
+        netif_stop_queue(dev);
+    }
+
+    spin_unlock_irq(&np->tx_lock);
+
+    np->stats.tx_bytes += skb->len;
+    np->stats.tx_packets++;
+
+    /* Only notify Xen if there are no outstanding responses. */
+    mb();
+    if ( np->net_idx->tx_resp_prod == i )
+    {
+        netop.cmd = NETOP_PUSH_BUFFERS;
+        netop.vif = np->idx;
+        (void)HYPERVISOR_net_io_op(&netop);
+    }
+
+    return 0;
+}
+
+
+static inline void _network_interrupt(struct net_device *dev)
+{
+    struct net_private *np = dev->priv;
+    unsigned long flags;
+    struct sk_buff *skb;
+    rx_resp_entry_t *rx;
+    NET_RING_IDX i;
+
+    if ( unlikely(np->state == STATE_CLOSED) )
+        return;
+
+    spin_lock_irqsave(&np->tx_lock, flags);
+    network_tx_buf_gc(dev);
+    spin_unlock_irqrestore(&np->tx_lock, flags);
+
+ again:
+    for ( i = np->rx_resp_cons; i != np->net_idx->rx_resp_prod; i++ )
+    {
+        rx = &np->net_ring->rx_ring[MASK_NET_RX_IDX(i)].resp;
+
+        skb = np->rx_skbs[rx->id];
+        ADD_ID_TO_FREELIST(np->rx_skbs, rx->id);
+
+/* XXXXX this is unsafe for live migrate -- if we do a scan before this
+point we won't transmit the right mfn! We have to fix this up in
+xc_linux_save */
+
+        phys_to_machine_mapping[virt_to_phys(skb->head) >> PAGE_SHIFT] =
+            (*(unsigned long *)get_ppte(skb->head)) >> PAGE_SHIFT;
+
+        if ( unlikely(rx->status != RING_STATUS_OK) )
+        {
+            /* Gate this error. We get a (valid) slew of them on suspend. */
+            if ( np->state == STATE_ACTIVE )
+            {
+                /* With live migrate, we even get these... Disable for now. */
+                // printk(KERN_ALERT "bad buffer on RX ring!(%d)\n", rx->status);
+            }
+            else
+                phys_to_machine_mapping[virt_to_phys(skb->head) >> PAGE_SHIFT] =
+                    0x80000004; // disown this page -- it was a flush
+
+            dev_kfree_skb_any(skb);
+            continue;
+        }
+
+        /*
+         * Set up shinfo -- from alloc_skb. This was particularly nasty: the
+         * shared info is hidden at the back of the data area (presumably so it
+         * can be shared), but on page flip it gets very spunked.
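That is, alloc_skb hides struct skb_shared_info at the tail of the data page, so once the page under skb->head has been flipped the area contains whatever the sender left there and every field must be rebuilt, which is what the three assignments below do. A standalone sketch of the layout assumption (struct fields and sizes illustrative):

    #include <stdio.h>

    #define PAGE 4096
    struct shinfo { int dataref; int nr_frags; void *frag_list; };

    int main(void)
    {
        char page[PAGE];
        /* Same layout trick as alloc_skb: shinfo at the page's tail. */
        struct shinfo *si = (struct shinfo *)(page + PAGE - sizeof(*si));
        si->dataref = 1; si->nr_frags = 0; si->frag_list = 0;
        printf("shinfo at offset %zu\n", (size_t)((char *)si - page));
        return 0;
    }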
+         */
+        atomic_set(&(skb_shinfo(skb)->dataref), 1);
+        skb_shinfo(skb)->nr_frags = 0;
+        skb_shinfo(skb)->frag_list = NULL;
+
+        skb->data = skb->tail = skb->head + rx->offset;
+        skb_put(skb, rx->size);
+        skb->protocol = eth_type_trans(skb, dev);
+
+        np->stats.rx_packets++;
+        np->stats.rx_bytes += rx->size;
+
+        netif_rx(skb);
+        dev->last_rx = jiffies;
+    }
+
+    np->rx_resp_cons = i;
+
+    network_alloc_rx_buffers(dev);
+
+    /* Deal with hypervisor racing our resetting of rx_event. */
+    mb();
+    if ( np->net_idx->rx_resp_prod != i )
+        goto again;
+}
+
+
+static void network_interrupt(int irq, void *unused, struct pt_regs *ptregs)
+{
+    struct list_head *ent;
+    struct net_private *np;
+    list_for_each ( ent, &dev_list )
+    {
+        np = list_entry(ent, struct net_private, list);
+        _network_interrupt(np->dev);
+    }
+}
+
+
+static int network_close(struct net_device *dev)
+{
+    struct net_private *np = dev->priv;
+    netop_t netop;
+
+    np->state = STATE_SUSPENDED;
+    wmb();
+
+    netif_stop_queue(np->dev);
+
+    netop.cmd = NETOP_FLUSH_BUFFERS;
+    netop.vif = np->idx;
+    (void)HYPERVISOR_net_io_op(&netop);
+
+    while ( (np->rx_resp_cons != np->net_idx->rx_req_prod) ||
+            (np->tx_resp_cons != np->net_idx->tx_req_prod) )
+    {
+        barrier();
+        current->state = TASK_INTERRUPTIBLE;
+        schedule_timeout(1);
+    }
+
+    wmb();
+    np->state = STATE_CLOSED;
+    wmb();
+
+    /* Now no longer safe to take interrupts for this device. */
+    clear_fixmap(FIX_NETRING0_BASE + np->net_ring_fixmap_idx);
+
+    MOD_DEC_USE_COUNT;
+
+    return 0;
+}
+
+
+static struct net_device_stats *network_get_stats(struct net_device *dev)
+{
+    struct net_private *np = (struct net_private *)dev->priv;
+    return &np->stats;
+}
+
+
+/*
+ * This notifier is installed for domain 0 only.
+ * All other domains have VFR rules installed on their behalf by domain 0
+ * when they are created. For bootstrap, Xen creates wildcard rules for
+ * domain 0 -- this notifier is used to detect when we find our proper
+ * IP address, so we can install the proper rules and remove the wildcards.
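+ *
+ * Concretely, for each address A the handler below installs (NETDEV_UP) or
+ * deletes (NETDEV_DOWN) a pair of ACCEPT rules: one for packets from this
+ * VIF with source address A out to the physical interface, and a mirror
+ * rule for packets from any interface with destination address A back to
+ * this VIF.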
+ */
+static int inetdev_notify(struct notifier_block *this,
+                          unsigned long event,
+                          void *ptr)
+{
+    struct in_ifaddr *ifa = (struct in_ifaddr *)ptr;
+    struct net_device *dev = ifa->ifa_dev->dev;
+    struct list_head *ent;
+    struct net_private *np;
+    int idx = -1;
+    network_op_t op;
+
+    list_for_each ( ent, &dev_list )
+    {
+        np = list_entry(ent, struct net_private, list);
+        if ( np->dev == dev )
+            idx = np->idx;
+    }
+
+    if ( idx == -1 )
+        goto out;
+
+    memset(&op, 0, sizeof(op));
+    op.u.net_rule.proto = NETWORK_PROTO_ANY;
+    op.u.net_rule.action = NETWORK_ACTION_ACCEPT;
+
+    if ( event == NETDEV_UP )
+        op.cmd = NETWORK_OP_ADDRULE;
+    else if ( event == NETDEV_DOWN )
+        op.cmd = NETWORK_OP_DELETERULE;
+    else
+        goto out;
+
+    op.u.net_rule.src_dom = 0;
+    op.u.net_rule.src_idx = idx;
+    op.u.net_rule.dst_dom = VIF_SPECIAL;
+    op.u.net_rule.dst_idx = VIF_PHYSICAL_INTERFACE;
+    op.u.net_rule.src_addr = ntohl(ifa->ifa_address);
+    op.u.net_rule.src_addr_mask = ~0UL;
+    op.u.net_rule.dst_addr = 0;
+    op.u.net_rule.dst_addr_mask = 0;
+    (void)HYPERVISOR_network_op(&op);
+
+    op.u.net_rule.src_dom = VIF_SPECIAL;
+    op.u.net_rule.src_idx = VIF_ANY_INTERFACE;
+    op.u.net_rule.dst_dom = 0;
+    op.u.net_rule.dst_idx = idx;
+    op.u.net_rule.src_addr = 0;
+    op.u.net_rule.src_addr_mask = 0;
+    op.u.net_rule.dst_addr = ntohl(ifa->ifa_address);
+    op.u.net_rule.dst_addr_mask = ~0UL;
+    (void)HYPERVISOR_network_op(&op);
+
+ out:
+    return NOTIFY_DONE;
+}
+
+static struct notifier_block notifier_inetdev = {
+    .notifier_call = inetdev_notify,
+    .next          = NULL,
+    .priority      = 0
+};
+
+
+static int __init init_module(void)
+{
+    int i, fixmap_idx = -1, err;
+    struct net_device *dev;
+    struct net_private *np;
+    netop_t netop;
+
+    INIT_LIST_HEAD(&dev_list);
+
+    /*
+     * Domain 0 must poke its own network rules as it discovers its IP
+     * addresses. All other domains have a privileged "parent" to do this
+     * for them at start of day.
+     */
+    if ( start_info.flags & SIF_INITDOMAIN )
+        (void)register_inetaddr_notifier(&notifier_inetdev);
+
+    network_irq = bind_virq_to_irq(VIRQ_NET);
+    debug_irq   = bind_virq_to_irq(VIRQ_DEBUG);
+
+    err = request_irq(network_irq, network_interrupt,
+                      SA_SAMPLE_RANDOM, "network", NULL);
+    if ( err )
+    {
+        printk(KERN_WARNING "Could not allocate network interrupt\n");
+        goto fail;
+    }
+
+    err = request_irq(debug_irq, dbg_network_int,
+                      SA_SHIRQ, "net_dbg", &dbg_network_int);
+    if ( err )
+        printk(KERN_WARNING "Non-fatal error -- no debug interrupt\n");
+
+    for ( i = 0; i < MAX_DOMAIN_VIFS; i++ )
+    {
+        /* If the VIF is invalid then the query hypercall will fail. */
+        netop.cmd = NETOP_GET_VIF_INFO;
+        netop.vif = i;
+        if ( HYPERVISOR_net_io_op(&netop) != 0 )
+            continue;
+
+        /*
+         * We actually only support up to 4 VIFs right now:
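+         * each VIF's shared ring is mapped via a fixmap slot in
+         * network_open() (FIX_NETRING0_BASE + fixmap_idx), and -- as far
+         * as this driver assumes -- only four such slots are reserved.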
+         */
+        if ( ++fixmap_idx == 4 )
+            break;
+
+        dev = alloc_etherdev(sizeof(struct net_private));
+        if ( dev == NULL )
+        {
+            err = -ENOMEM;
+            goto fail;
+        }
+
+        np = dev->priv;
+        np->state = STATE_CLOSED;
+        np->net_ring_fixmap_idx = fixmap_idx;
+        np->idx = i;
+
+        SET_MODULE_OWNER(dev);
+        dev->open            = network_open;
+        dev->hard_start_xmit = network_start_xmit;
+        dev->stop            = network_close;
+        dev->get_stats       = network_get_stats;
+
+        memcpy(dev->dev_addr, netop.u.get_vif_info.vmac, ETH_ALEN);
+
+        if ( (err = register_netdev(dev)) != 0 )
+        {
+            kfree(dev);
+            goto fail;
+        }
+
+        np->dev = dev;
+        list_add(&np->list, &dev_list);
+    }
+
+    return 0;
+
+ fail:
+    cleanup_module();
+    return err;
+}
+
+
+static void cleanup_module(void)
+{
+    struct net_private *np;
+    struct net_device *dev;
+
+    while ( !list_empty(&dev_list) )
+    {
+        np = list_entry(dev_list.next, struct net_private, list);
+        list_del(&np->list);
+        dev = np->dev;
+        unregister_netdev(dev);
+        kfree(dev);
+    }
+
+    if ( start_info.flags & SIF_INITDOMAIN )
+        (void)unregister_inetaddr_notifier(&notifier_inetdev);
+
+    /* dev_id values must match those passed to request_irq() above. */
+    free_irq(network_irq, NULL);
+    free_irq(debug_irq, &dbg_network_int);
+
+    unbind_virq_from_irq(VIRQ_NET);
+    unbind_virq_from_irq(VIRQ_DEBUG);
+}
+
+
+module_init(init_module);
+module_exit(cleanup_module);
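+
+/*
+ * Note that cleanup_module() (also the failure path of init_module())
+ * releases resources in a safe order: net devices come off first, then the
+ * address notifier and the interrupt handlers, and only then the
+ * VIRQ-to-IRQ bindings those handlers were attached to.
+ */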