9 files changed, 1073 insertions, 562 deletions
diff --git a/xenolinux-2.4.16-sparse/arch/xeno/defconfig b/xenolinux-2.4.16-sparse/arch/xeno/defconfig
index b278dec50f..c83d96d1b0 100644
--- a/xenolinux-2.4.16-sparse/arch/xeno/defconfig
+++ b/xenolinux-2.4.16-sparse/arch/xeno/defconfig
@@ -114,6 +114,7 @@ CONFIG_BLK_DEV_NBD=y
 CONFIG_BLK_DEV_RAM=y
 CONFIG_BLK_DEV_RAM_SIZE=4096
 CONFIG_BLK_DEV_INITRD=y
+CONFIG_XENOLINUX_BLOCK=y
 
 #
 # Multi-device support (RAID and LVM)
@@ -379,13 +380,13 @@ CONFIG_MSDOS_PARTITION=y
 #
 # Kernel hacking
 #
-CONFIG_DEBUG_KERNEL=y
-CONFIG_DEBUG_HIGHMEM=y
-CONFIG_DEBUG_SLAB=y
-CONFIG_DEBUG_IOVIRT=y
+# CONFIG_DEBUG_KERNEL is not set
+# CONFIG_DEBUG_HIGHMEM is not set
+# CONFIG_DEBUG_SLAB is not set
+# CONFIG_DEBUG_IOVIRT is not set
 # CONFIG_MAGIC_SYSRQ is not set
-CONFIG_DEBUG_SPINLOCK=y
-CONFIG_DEBUG_BUGVERBOSE=y
+# CONFIG_DEBUG_SPINLOCK is not set
+# CONFIG_DEBUG_BUGVERBOSE is not set
 # CONFIG_KDB is not set
 CONFIG_KALLSYMS=y
-CONFIG_FRAME_POINTER=y
+# CONFIG_FRAME_POINTER is not set
diff --git a/xenolinux-2.4.16-sparse/arch/xeno/drivers/block/Makefile b/xenolinux-2.4.16-sparse/arch/xeno/drivers/block/Makefile
index 9361a01ec7..74a0c6c565 100644
--- a/xenolinux-2.4.16-sparse/arch/xeno/drivers/block/Makefile
+++ b/xenolinux-2.4.16-sparse/arch/xeno/drivers/block/Makefile
@@ -1,3 +1,3 @@
 O_TARGET := blk.o
-obj-y := block.o
+obj-y := xl_block.o xl_block_test.o
 include $(TOPDIR)/Rules.make
diff --git a/xenolinux-2.4.16-sparse/arch/xeno/drivers/block/block.c b/xenolinux-2.4.16-sparse/arch/xeno/drivers/block/block.c
deleted file mode 100644
index bf7d416dff..0000000000
--- a/xenolinux-2.4.16-sparse/arch/xeno/drivers/block/block.c
+++ /dev/null
@@ -1,392 +0,0 @@
-/******************************************************************************
- * block.c
- * 
- * Virtual block driver for XenoLinux.
- * 
- * adapted from network.c
- */
-
-#include <linux/config.h>
-#include <linux/module.h>
-
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/string.h>
-#include <linux/errno.h>
-
-#include <asm/hypervisor-ifs/block.h>
-
-#ifdef UNDEFINED
-
-#include <linux/netdevice.h>
-#include <linux/inetdevice.h>
-#include <linux/etherdevice.h>
-#include <linux/skbuff.h>
-#include <linux/init.h>
-
-#include <net/sock.h>
-
-#define BLK_TX_IRQ _EVENT_BLK_TX
-#define BLK_RX_IRQ _EVENT_BLK_RX
-
-#define TX_MAX_ENTRIES (TX_RING_SIZE - 2)
-#define RX_MAX_ENTRIES (RX_RING_SIZE - 2)
-
-#define TX_RING_INC(_i)    (((_i)+1) & (TX_RING_SIZE-1))
-#define RX_RING_INC(_i)    (((_i)+1) & (RX_RING_SIZE-1))
-#define TX_RING_ADD(_i,_j) (((_i)+(_j)) & (TX_RING_SIZE-1))
-#define RX_RING_ADD(_i,_j) (((_i)+(_j)) & (RX_RING_SIZE-1))
-
-#define RX_BUF_SIZE 1600 /* Ethernet MTU + plenty of slack! */
-
-
-
-int	    network_probe(struct net_device *dev);
-static int  network_open(struct net_device *dev);
-static int  network_start_xmit(struct sk_buff *skb, struct net_device *dev);
-static int  network_close(struct net_device *dev);
-static struct net_device_stats *network_get_stats(struct net_device *dev);
-static void network_rx_int(int irq, void *dev_id, struct pt_regs *ptregs);
-static void network_tx_int(int irq, void *dev_id, struct pt_regs *ptregs);
-static void network_tx_buf_gc(struct net_device *dev);
-static void network_alloc_rx_buffers(struct net_device *dev);
-static void network_free_rx_buffers(struct net_device *dev);
-
-static struct net_device dev_net_xeno;
-
-/*
- * RX RING:   RX_IDX <= rx_cons <= rx_prod
- * TX RING:   TX_IDX <= tx_cons <= tx_prod
- * (*_IDX allocated privately here, *_cons & *_prod shared with hypervisor)
- */
-struct net_private
-{
-    struct net_device_stats stats;
-    struct sk_buff **tx_skb_ring;
-    struct sk_buff **rx_skb_ring;
-    atomic_t tx_entries;
-    unsigned int rx_idx, tx_idx, tx_full;
-    net_ring_t *net_ring;
-    spinlock_t tx_lock;
-};
-
- 
-int __init network_probe(struct net_device *dev)
-{
-    SET_MODULE_OWNER(dev);
-
-    memcpy(dev->dev_addr, "\xFE\xFD\x00\x00\x00\x00", 6);
-
-    dev->open = network_open;
-    dev->hard_start_xmit = network_start_xmit;
-    dev->stop = network_close;
-    dev->get_stats = network_get_stats;
-
-    ether_setup(dev);
-    
-    return 0;
-}
-
-
-static int network_open(struct net_device *dev)
-{
-    struct net_private *np;
-    int error;
-
-    np = kmalloc(sizeof(struct net_private), GFP_KERNEL);
-    if ( np == NULL ) 
-    {
-        printk(KERN_WARNING "%s: No memory for private data\n", dev->name);
-        return -ENOMEM;
-    }
-    memset(np, 0, sizeof(struct net_private));
-    dev->priv = np;
-
-    spin_lock_init(&np->tx_lock);
-
-    atomic_set(&np->tx_entries, 0);
-
-    np->net_ring  = start_info.net_rings;
-    np->net_ring->tx_prod = np->net_ring->tx_cons = np->net_ring->tx_event = 0;
-    np->net_ring->rx_prod = np->net_ring->rx_cons = np->net_ring->rx_event = 0;
-    np->net_ring->tx_ring = NULL;
-    np->net_ring->rx_ring = NULL;
-
-    np->tx_skb_ring = kmalloc(TX_RING_SIZE * sizeof(struct sk_buff *),
-                              GFP_KERNEL);
-    np->rx_skb_ring = kmalloc(RX_RING_SIZE * sizeof(struct sk_buff *),
-                              GFP_KERNEL);
-    np->net_ring->tx_ring = kmalloc(TX_RING_SIZE * sizeof(tx_entry_t), 
-                                  GFP_KERNEL);
-    np->net_ring->rx_ring = kmalloc(RX_RING_SIZE * sizeof(rx_entry_t), 
-                                  GFP_KERNEL);
-    if ( (np->tx_skb_ring == NULL) || (np->rx_skb_ring == NULL) ||
-         (np->net_ring->tx_ring == NULL) || (np->net_ring->rx_ring == NULL) )
-    {
-        printk(KERN_WARNING "%s; Could not allocate ring memory\n", dev->name);
-        error = -ENOBUFS;
-        goto fail;
-    }
-
-    network_alloc_rx_buffers(dev);
-
-    error = request_irq(NET_RX_IRQ, network_rx_int, 0, "net-rx", dev);
-    if ( error )
-    {
-        printk(KERN_WARNING "%s: Could not allocate receive interrupt\n",
-               dev->name);
-        goto fail;
-    }
-
-    error = request_irq(NET_TX_IRQ, network_tx_int, 0, "net-tx", dev);
-    if ( error )
-    {
-        printk(KERN_WARNING "%s: Could not allocate transmit interrupt\n",
-               dev->name);
-        free_irq(NET_RX_IRQ, dev);
-        goto fail;
-    }
-
-    printk("XenoLinux Virtual Network Driver installed as %s\n", dev->name);
-
-    netif_start_queue(dev);
-
-    MOD_INC_USE_COUNT;
-
-    return 0;
-
- fail:
-    if ( np->net_ring->rx_ring ) kfree(np->net_ring->rx_ring);
-    if ( np->net_ring->tx_ring ) kfree(np->net_ring->tx_ring);
-    if ( np->rx_skb_ring ) kfree(np->rx_skb_ring);
-    if ( np->tx_skb_ring ) kfree(np->tx_skb_ring);
-    kfree(np);
-    return error;
-}
-
-
-static void network_tx_buf_gc(struct net_device *dev)
-{
-    unsigned int i;
-    struct net_private *np = dev->priv;
-    struct sk_buff *skb;
-    unsigned long flags;
-
-    spin_lock_irqsave(&np->tx_lock, flags);
-
-    for ( i = np->tx_idx; i != np->net_ring->tx_cons; i = TX_RING_INC(i) )
-    {
-        skb = np->tx_skb_ring[i];
-        dev_kfree_skb_any(skb);
-        atomic_dec(&np->tx_entries);
-    }
-
-    np->tx_idx = i;
-
-    if ( np->tx_full && (atomic_read(&np->tx_entries) < TX_MAX_ENTRIES) )
-    {
-        np->tx_full = 0;
-        netif_wake_queue(dev);
-    }
-
-    spin_unlock_irqrestore(&np->tx_lock, flags);
-}
-
-
-static void network_alloc_rx_buffers(struct net_device *dev)
-{
-    unsigned int i;
-    struct net_private *np = dev->priv;
-    struct sk_buff *skb;
-    unsigned int end = RX_RING_ADD(np->rx_idx, RX_MAX_ENTRIES);
-
-    for ( i = np->net_ring->rx_prod; i != end; i = RX_RING_INC(i) )
-    {
-        skb = dev_alloc_skb(RX_BUF_SIZE);
-        if ( skb == NULL ) break;
-        skb->dev = dev;
-        skb_reserve(skb, 2); /* word align the IP header */
-        np->rx_skb_ring[i] = skb;
-        np->net_ring->rx_ring[i].addr = (unsigned long)skb->data;
-        np->net_ring->rx_ring[i].size = RX_BUF_SIZE - 16; /* arbitrary */
-    }
-
-    np->net_ring->rx_prod = i;
-
-    np->net_ring->rx_event = RX_RING_INC(np->rx_idx);
-
-    HYPERVISOR_net_update();
-}
-
-
-static void network_free_rx_buffers(struct net_device *dev)
-{
-    unsigned int i;
-    struct net_private *np = dev->priv;
-    struct sk_buff *skb;    
-
-    for ( i = np->rx_idx; i != np->net_ring->rx_prod; i = RX_RING_INC(i) )
-    {
-        skb = np->rx_skb_ring[i];
-        dev_kfree_skb(skb);
-    }
-}
-
-
-static int network_start_xmit(struct sk_buff *skb, struct net_device *dev)
-{
-    unsigned int i;
-    struct net_private *np = (struct net_private *)dev->priv;
-
-    if ( np->tx_full )
-    {
-        printk(KERN_WARNING "%s: full queue wasn't stopped!\n", dev->name);
-        netif_stop_queue(dev);
-        return -ENOBUFS;
-    }
-
-    i = np->net_ring->tx_prod;
-    np->tx_skb_ring[i] = skb;
-    np->net_ring->tx_ring[i].addr = (unsigned long)skb->data;
-    np->net_ring->tx_ring[i].size = skb->len;
-    np->net_ring->tx_prod = TX_RING_INC(i);
-    atomic_inc(&np->tx_entries);
-
-    np->stats.tx_bytes += skb->len;
-    np->stats.tx_packets++;
-
-    spin_lock_irq(&np->tx_lock);
-    if ( atomic_read(&np->tx_entries) >= TX_MAX_ENTRIES )
-    {
-        np->tx_full = 1;
-        netif_stop_queue(dev);
-        np->net_ring->tx_event = TX_RING_ADD(np->tx_idx,
-                                           atomic_read(&np->tx_entries) >> 1);
-    }
-    else
-    {
-        /* Avoid unnecessary tx interrupts. */
-        np->net_ring->tx_event = TX_RING_INC(np->net_ring->tx_prod);
-    }
-    spin_unlock_irq(&np->tx_lock);
-
-    /* Must do this after setting tx_event: race with updates of tx_cons. */
-    network_tx_buf_gc(dev);
-
-    HYPERVISOR_net_update();
-
-    return 0;
-}
-
-
-static void network_rx_int(int irq, void *dev_id, struct pt_regs *ptregs)
-{
-    unsigned int i;
-    struct net_device *dev = (struct net_device *)dev_id;
-    struct net_private *np = dev->priv;
-    struct sk_buff *skb;
-    
- again:
-    for ( i = np->rx_idx; i != np->net_ring->rx_cons; i = RX_RING_INC(i) )
-    {
-        skb = np->rx_skb_ring[i];
-        skb_put(skb, np->net_ring->rx_ring[i].size);
-        skb->protocol = eth_type_trans(skb, dev);
-        np->stats.rx_packets++;
-        np->stats.rx_bytes += np->net_ring->rx_ring[i].size;
-        netif_rx(skb);
-        dev->last_rx = jiffies;
-    }
-
-    np->rx_idx = i;
-
-    network_alloc_rx_buffers(dev);
-    
-    /* Deal with hypervisor racing our resetting of rx_event. */
-    smp_mb();
-    if ( np->net_ring->rx_cons != i ) goto again;
-}
-
-
-static void network_tx_int(int irq, void *dev_id, struct pt_regs *ptregs)
-{
-    struct net_device *dev = (struct net_device *)dev_id;
-    network_tx_buf_gc(dev);
-}
-
-
-static int network_close(struct net_device *dev)
-{
-    struct net_private *np = dev->priv;
-
-    netif_stop_queue(dev);
-    free_irq(NET_RX_IRQ, dev);
-    free_irq(NET_TX_IRQ, dev);
-    network_free_rx_buffers(dev);
-    kfree(np->net_ring->rx_ring);
-    kfree(np->net_ring->tx_ring);
-    kfree(np->rx_skb_ring);
-    kfree(np->tx_skb_ring);
-    kfree(np);
-    MOD_DEC_USE_COUNT;
-    return 0;
-}
-
-
-static struct net_device_stats *network_get_stats(struct net_device *dev)
-{
-    struct net_private *np = (struct net_private *)dev->priv;
-    return &np->stats;
-}
-
-
-static int __init init_module(void)
-{
-    memset(&dev_net_xeno, 0, sizeof(dev_net_xeno));
-    strcpy(dev_net_xeno.name, "eth%d");
-    dev_net_xeno.init = network_probe;
-    return (register_netdev(&dev_net_xeno) != 0) ? -EIO : 0;
-}
-
-
-static void __exit cleanup_module(void)
-{
-    unregister_netdev(&dev_net_xeno);
-}
-
-#endif /* UNDEFINED */
-
-
-static void block_initialize(void)
-{
-  blk_ring_t *blk_ring = start_info.blk_ring;
-
-  if ( blk_ring == NULL ) return;
-
-  blk_ring->tx_prod = blk_ring->tx_cons = blk_ring->tx_event = 0;
-  blk_ring->rx_prod = blk_ring->rx_cons = blk_ring->rx_event = 0;
-  blk_ring->tx_ring = NULL;
-  blk_ring->rx_ring = NULL;
-}
-
-
-/*
- * block_setup initialized the xeno block device driver
- */
-
-static int __init init_module(void)
-{
-  block_initialize();
-  printk("XenoLinux Virtual Block Device Driver installed\n");
-  return 0;
-}
-
-static void __exit cleanup_module(void)
-{
-  printk("XenoLinux Virtual Block Device Driver uninstalled\n");
-}
-
-module_init(init_module);
-module_exit(cleanup_module);
-
diff --git a/xenolinux-2.4.16-sparse/arch/xeno/drivers/block/xl_block.c b/xenolinux-2.4.16-sparse/arch/xeno/drivers/block/xl_block.c
new file mode 100644
index 0000000000..f7bd088ff4
--- /dev/null
+++ b/xenolinux-2.4.16-sparse/arch/xeno/drivers/block/xl_block.c
@@ -0,0 +1,595 @@
+#include <linux/config.h>
+#include <linux/module.h>
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+
+#include <linux/fs.h>
+#include <linux/hdreg.h>                               /* HDIO_GETGEO, et al */
+#include <linux/blkdev.h>
+#include <linux/major.h>
+
+/* NOTE: this is drive independent, so no inclusion of ide.h */
+
+#include <asm/hypervisor-ifs/block.h>
+#include <asm/hypervisor-ifs/hypervisor-if.h>
+#include <asm/io.h>
+#include <asm/uaccess.h>                                       /* put_user() */
+
+#define MAJOR_NR XLBLK_MAJOR   /* force defns in blk.h, must precede include */
+static int xlblk_major = XLBLK_MAJOR;
+
+#include <linux/blk.h>           /* must come after definition of MAJOR_NR!! */
+
+/* instead of including linux/ide.h to pick up the definition of byte
+ * (and consequently screwing up blk.h), we'll just copy the definition */
+typedef unsigned char	byte; 
+
+void xlblk_ide_register_disk(int, unsigned long);
+
+#define XLBLK_MAX 2                                /* very arbitrary */
+#define XLBLK_MAJOR_NAME "xhd"
+#define IDE_PARTN_BITS 6                           /* from ide.h::PARTN_BITS */
+#define IDE_PARTN_MASK ((1<<IDE_PARTN_BITS)-1)     /* from ide.h::PARTN_MASK */
+static int xlblk_blk_size[XLBLK_MAX];
+static int xlblk_blksize_size[XLBLK_MAX];
+static int xlblk_read_ahead; 
+static int xlblk_hardsect_size[XLBLK_MAX];
+static int xlblk_max_sectors[XLBLK_MAX];
+
+#define XLBLK_RX_IRQ _EVENT_BLK_RX
+#define XLBLK_TX_IRQ _EVENT_BLK_TX
+
+#define DEBUG_IRQ    _EVENT_DEBUG 
+
+typedef struct xlblk_device
+{
+  struct buffer_head *bh;
+  unsigned int tx_count;                  /* number of used slots in tx ring */
+} xlblk_device_t;
+
+xlblk_device_t xlblk_device;
+
+#define XLBLK_DEBUG       0
+#define XLBLK_DEBUG_IOCTL 0
+
+/* 
+ * disk management
+ */
+
+xen_disk_info_t xen_disk_info;
+
+/* some declarations */
+void hypervisor_request(void *         id,
+			int            operation,
+			char *         buffer,
+			unsigned long  block_number,
+			unsigned short block_size,
+			kdev_t         device,
+			int            mode);
+
+
+/* ------------------------------------------------------------------------
+ */
+
+static int xenolinux_block_open(struct inode *inode, struct file *filep)
+{
+    if (XLBLK_DEBUG)
+	printk (KERN_ALERT "xenolinux_block_open\n"); 
+    /* no per-open state is kept, so opening always succeeds */
+    return 0;
+}
+
+static int xenolinux_block_release(struct inode *inode, struct file *filep)
+{
+    if (XLBLK_DEBUG)
+	printk (KERN_ALERT "xenolinux_block_release\n");
+    /* nothing was set up at open time, so there is nothing to release */
+    return 0;
+}
+
+/* ioctl entry point: needs CAP_SYS_ADMIN; unlisted commands return 0.
+ * BLKGETSIZE always reports disk 0's capacity and the geometry calls
+ * report a fixed 0xff heads / 0x3f sectors / 0x106 cylinders layout. */
+static int xenolinux_block_ioctl(struct inode *inode, struct file *filep,
+			  unsigned command, unsigned long argument)
+{
+    int minor_dev;
+    struct hd_geometry *geo = (struct hd_geometry *)argument;
+    struct hd_big_geometry *big = (struct hd_big_geometry *)argument;
+
+    if (XLBLK_DEBUG_IOCTL)
+	printk (KERN_ALERT "xenolinux_block_ioctl\n");
+
+    /* check permissions */
+    if (!capable(CAP_SYS_ADMIN)) return -EPERM;
+    if (!inode)                  return -EINVAL;
+    minor_dev = MINOR(inode->i_rdev);
+    if (minor_dev >= XLBLK_MAX)  return -ENODEV;
+
+    if (XLBLK_DEBUG_IOCTL)
+	printk (KERN_ALERT "   command: 0x%x, argument: 0x%lx, minor: 0x%x\n",
+		command, (long) argument, minor_dev);
+
+    switch (command) {
+    case BLKGETSIZE:
+	if (XLBLK_DEBUG_IOCTL)
+	    printk (KERN_ALERT
+		    "   BLKGETSIZE: %x %lx\n", BLKGETSIZE,
+		    (long) xen_disk_info.disks[0].capacity);
+	return put_user(xen_disk_info.disks[0].capacity,
+			(unsigned long *) argument);
+
+    case BLKRRPART:
+	if (XLBLK_DEBUG_IOCTL)
+	    printk (KERN_ALERT "   BLKRRPART: %x\n", BLKRRPART);
+	break;
+
+    case BLKSSZGET:
+	if (XLBLK_DEBUG_IOCTL)
+	    printk (KERN_ALERT "   BLKSSZGET: %x 0x%x\n", BLKSSZGET,
+		    xlblk_hardsect_size[minor_dev]);
+	/* the size goes out through the user pointer, not as the result */
+	return put_user(xlblk_hardsect_size[minor_dev], (int *) argument);
+
+    case HDIO_GETGEO:
+	if (XLBLK_DEBUG_IOCTL)
+	    printk (KERN_ALERT "   HDIO_GETGEO: %x\n", HDIO_GETGEO);
+	if (!argument) return -EINVAL;
+	if (put_user(0x00,  (unsigned long *) &geo->start)) return -EFAULT;
+	if (put_user(0xff,  (byte *)&geo->heads)) return -EFAULT;
+	if (put_user(0x3f,  (byte *)&geo->sectors)) return -EFAULT;
+	if (put_user(0x106, (unsigned short *)&geo->cylinders)) return -EFAULT;
+	return 0;
+
+    case HDIO_GETGEO_BIG:
+	if (XLBLK_DEBUG_IOCTL)
+	    printk (KERN_ALERT "   HDIO_GETGEO_BIG: %x\n", HDIO_GETGEO_BIG);
+	/* caller's buffer is a struct hd_big_geometry: different layout */
+	if (!argument) return -EINVAL;
+	if (put_user(0x00,  (unsigned long *) &big->start))  return -EFAULT;
+	if (put_user(0xff,  (byte *)&big->heads))   return -EFAULT;
+	if (put_user(0x3f,  (byte *)&big->sectors)) return -EFAULT;
+	if (put_user(0x106, (unsigned int *) &big->cylinders)) return -EFAULT;
+	return 0;
+
+    default:
+	if (XLBLK_DEBUG_IOCTL)
+	    printk (KERN_ALERT "   eh? unknown ioctl\n");
+	break;
+    }
+
+    return 0;
+}
+
+static int xenolinux_block_check(kdev_t dev)
+{
+    if (XLBLK_DEBUG) 
+      printk (KERN_ALERT "xenolinux_block_check\n");
+    return 0;
+}
+
+static int xenolinux_block_revalidate(kdev_t dev)
+{
+    if (XLBLK_DEBUG) 
+	printk (KERN_ALERT "xenolinux_block_revalidate\n"); 
+    return 0;
+}
+
+/*
+ * hypervisor_request
+ *
+ * request block io: fill the next tx ring slot and, for sync mode, trap
+ * to the hypervisor.  BUG()s if the tx ring is already full.
+ * id: for guest use only.
+ * operation: XEN_BLOCK_READ, XEN_BLOCK_WRITE or XEN_BLOCK_PROBE
+ * buffer: buffer to read/write into. this should be a
+ *   virtual address in the guest os.
+ * block_number:  start of the transfer; used unscaled as the sector number
+ * block_size:  size of each block
+ * device:  ide/hda is 768 or 0x300
+ * mode: XEN_BLOCK_SYNC or XEN_BLOCK_ASYNC.  async requests
+ *   will queue until a sync request is issued.
+ */
+
+void hypervisor_request(void *         id,
+			int            operation,
+			char *         buffer,
+			unsigned long  block_number,
+			unsigned short block_size,
+			kdev_t         device,
+			int            mode)
+{
+    blk_ring_t *blk_ring = start_info.blk_ring;
+    int position;
+    void *buffer_pa, *buffer_ma;
+    kdev_t phys_device = (kdev_t) 0;
+    unsigned long sector_number = 0;
+    struct gendisk *gd;
+
+    /* the ring entry carries a machine address: virt -> phys -> machine */
+    buffer_pa = (void *)virt_to_phys(buffer);
+    buffer_ma = (void *)phys_to_machine((unsigned long)buffer_pa);
+
+    if (operation == XEN_BLOCK_PROBE) {
+	phys_device = (kdev_t) 0;
+	sector_number = 0;
+
+    } else if (operation == XEN_BLOCK_READ || operation == XEN_BLOCK_WRITE) {
+
+	/*
+	 * map logical major device to the physical device number
+	 *
+	 *           XLBLK_MAJOR -> IDE0_MAJOR  (123 -> 3)
+	 */
+	if (MAJOR(device) == XLBLK_MAJOR)
+	    phys_device = MKDEV(IDE0_MAJOR, 0);
+	else {
+	    printk (KERN_ALERT "error: xl_block::hypervisor_request: "
+		    "unknown device [0x%x]\n", device);
+	    BUG();
+	}
+
+	/*
+	 * compute real buffer location on disk
+	 * (from ll_rw_block.c::submit_bh)
+	 */
+
+
+	sector_number = block_number /* * block_size >> 9 */;
+
+	if((gd = (struct gendisk *)xen_disk_info.disks[0].gendisk) != NULL)
+	    sector_number += gd->part[MINOR(device)&IDE_PARTN_MASK].start_sect;
+    }
+
+
+    if (BLK_TX_RING_INC(blk_ring->btx_prod) == blk_ring->btx_cons) {
+	printk (KERN_ALERT "hypervisor_request: btx_cons: %d, btx_prod:%d\n",
+		blk_ring->btx_cons, blk_ring->btx_prod);
+	BUG();
+    }
+
+    /* Fill out a communications ring structure & trap to the hypervisor */
+    position = blk_ring->btx_prod;
+    blk_ring->btx_ring[position].id            = id;
+    blk_ring->btx_ring[position].priority      = mode;
+    blk_ring->btx_ring[position].operation     = operation;
+    blk_ring->btx_ring[position].buffer        = buffer_ma;
+    blk_ring->btx_ring[position].block_number  = block_number;
+    blk_ring->btx_ring[position].block_size    = block_size;
+    blk_ring->btx_ring[position].device        = phys_device;
+    blk_ring->btx_ring[position].sector_number = sector_number;
+
+    blk_ring->btx_prod = BLK_TX_RING_INC(blk_ring->btx_prod);
+
+    switch(mode) {
+
+    case XEN_BLOCK_SYNC:
+	/* trap into hypervisor */
+	HYPERVISOR_block_io_op();
+	break;
+
+    case XEN_BLOCK_ASYNC:
+	/* for now, do nothing.  the request will go in the ring and
+	   the next sync request will trigger the hypervisor to act */
+	printk("Oh dear-- ASYNC xen block of doom!\n");
+	break;
+
+    default:
+	/* ummm, unknown mode. */
+	printk("xl_block thingy: unknown mode %d\n", mode);
+	BUG();
+    }
+
+    return;
+}
+
+
+/*
+ * do_xlblk_request
+ *
+ * request-queue handler: takes each pending request off the queue and
+ * hands every buffer head in it to hypervisor_request() as its own
+ * synchronous ring entry, tagged with the request pointer as the id.
+ * returns early, leaving the queue untouched, while it is plugged.
+ *
+ * TO DO: should probably release the io_request_lock and then re-acquire
+ *        (see LDD p. 338)
+ */
+static void do_xlblk_request (request_queue_t *rq)
+{
+    struct request *req;
+
+    if (XLBLK_DEBUG)
+	printk (KERN_ALERT "xlblk.c::do_xlblk_request for '%s'\n",
+		DEVICE_NAME);
+
+    while (!QUEUE_EMPTY)
+    {
+	struct buffer_head *bh;
+	int rw;
+
+	if(rq->plugged)
+	    return ;
+
+	req = CURRENT;
+
+	if (XLBLK_DEBUG)
+	    printk (KERN_ALERT
+		    "do_xlblk_request %p: cmd %i, sec %lx, (%li) bh:%p\n",
+		    req, req->cmd, req->sector,
+		    req->current_nr_sectors, req->bh);
+
+	/* is there space in the tx ring for this request?
+	 * if the ring is full, then leave the request in the queue
+	 *
+	 * THIS IS A BIT BOGUS SINCE XEN COULD BE UPDATING BTX_CONS
+	 * AT THE SAME TIME
+	 */
+	{
+	    blk_ring_t *blk_ring = start_info.blk_ring;
+
+	    /* tx indices must wrap with the tx ring macro, not the rx one */
+	    if (BLK_TX_RING_INC(blk_ring->btx_prod) == blk_ring->btx_cons)
+	    {
+		printk (KERN_ALERT "OOPS, TX LOOKS FULL  cons: %d  prod: %d\n",
+			blk_ring->btx_cons, blk_ring->btx_prod);
+		BUG();
+		break;
+	    }
+	}
+
+	/*
+	 * take the request off the queue exactly once; a second
+	 * blkdev_dequeue_request() would unlink an already-unlinked entry
+	 */
+	req->errors = 0;
+	blkdev_dequeue_request(req);
+
+	bh = req->bh;
+
+	while (bh)
+	{
+	    rw = req->cmd;
+	    if (rw == READA)  rw= READ;
+	    if ((rw != READ) && (rw != WRITE)) {
+		printk (KERN_ALERT
+			"XenoLinux Virtual Block Device: bad cmd: %d\n", rw);
+		BUG();
+	    }
+
+	    hypervisor_request (req, rw == READ ?
+				XEN_BLOCK_READ : XEN_BLOCK_WRITE,
+				bh->b_data, bh->b_rsector, bh->b_size,
+				bh->b_dev, XEN_BLOCK_SYNC);
+	    bh = bh->b_reqnext;
+	}
+    }
+
+    return;
+}
+
+
+static struct block_device_operations xenolinux_block_fops = 
+{
+    open:               xenolinux_block_open,
+    release:            xenolinux_block_release,
+    ioctl:              xenolinux_block_ioctl,
+    check_media_change: xenolinux_block_check,
+    revalidate:         xenolinux_block_revalidate,
+};
+
+static void xlblk_rx_int(int irq, void *dev_id, struct pt_regs *ptregs)
+{
+    blk_ring_t *blk_ring = start_info.blk_ring;
+    struct request *req;
+    int loop;
+    u_long flags; 
+    /* rx event handler: walk every response from brx_cons up to brx_prod */
+    for (loop = blk_ring->brx_cons;
+	 loop != blk_ring->brx_prod;
+	 loop = BLK_RX_RING_INC(loop)) {
+
+	blk_ring_entry_t *bret = &blk_ring->brx_ring[loop];
+	/* probe responses are skipped (xlblk_init's probe passes a NULL id) */
+	if(bret->operation == XEN_BLOCK_PROBE)
+	    continue; 
+	/* id is the struct request * that do_xlblk_request passed as the id */
+	spin_lock_irqsave(&io_request_lock, flags);
+	req = (struct request *)bret->id;
+	    
+	if (!end_that_request_first(req, 1, "XenBlk"))
+	    end_that_request_last(req);
+	spin_unlock_irqrestore(&io_request_lock, flags);
+	
+    }
+    /* record how far the rx ring has been consumed */
+    blk_ring->brx_cons = loop;
+}
+
+static void xlblk_tx_int(int irq, void *dev_id, struct pt_regs *ptregs)
+{
+    if (XLBLK_DEBUG) 
+	printk (KERN_ALERT "--- xlblock::xlblk_tx_int\n"); 
+}
+
+/* Driver start-up: allocate the tx/rx rings, claim the block event IRQs,
+ * probe Xen for disks, then register the major, request queue and gendisk.
+ * Returns 0 or -errno; IRQs and ring memory are released on failure. */
+int __init xlblk_init(void)
+{
+    blk_ring_t *blk_ring = start_info.blk_ring;
+    int loop, error;
+
+    /* initialize memory rings to communicate with hypervisor */
+    if ( blk_ring == NULL ) return -ENOMEM;
+
+    blk_ring->btx_prod = blk_ring->btx_cons = 0;
+    blk_ring->brx_prod = blk_ring->brx_cons = 0;
+
+    blk_ring->btx_ring = kmalloc(BLK_TX_RING_SIZE * sizeof(blk_ring_entry_t),
+				 GFP_KERNEL);
+    blk_ring->brx_ring = kmalloc(BLK_RX_RING_SIZE * sizeof(blk_ring_entry_t),
+				 GFP_KERNEL);
+
+    if ((blk_ring->btx_ring == NULL) || (blk_ring->brx_ring == NULL)) {
+	printk (KERN_ALERT "could not alloc ring memory for block device\n");
+	error = -ENOBUFS;
+	goto fail;
+    }
+
+    error = request_irq(XLBLK_RX_IRQ, xlblk_rx_int, 0,
+			"xlblk-rx", &xlblk_device);
+    if (error) {
+	printk(KERN_ALERT "Could not allocate receive interrupt\n");
+	goto fail;
+    }
+
+    error = request_irq(XLBLK_TX_IRQ, xlblk_tx_int, 0,
+			"xlblk-tx", &xlblk_device);
+    if (error) {
+	printk(KERN_ALERT "Could not allocate transmit interrupt\n");
+	free_irq(XLBLK_RX_IRQ, &xlblk_device);
+	goto fail;
+    }
+
+    memset (&xen_disk_info, 0, sizeof(xen_disk_info));
+    hypervisor_request(NULL, XEN_BLOCK_PROBE, (char *) &xen_disk_info,
+		       0, 0, (kdev_t) 0, XEN_BLOCK_SYNC);
+    for (loop = 0; loop < xen_disk_info.count; loop++)
+	printk (KERN_ALERT "  %2d: type: %d, capacity: %ld\n",
+		loop, xen_disk_info.disks[loop].type,
+		xen_disk_info.disks[loop].capacity);
+
+    SET_MODULE_OWNER(&xenolinux_block_fops);
+    error = register_blkdev(xlblk_major, "block", &xenolinux_block_fops);
+    if (error < 0) {
+	printk (KERN_ALERT "xenolinux block: can't get major %d\n",
+		xlblk_major);
+	goto fail_irq;              /* don't leak the IRQs and the rings */
+    }
+
+    /* initialize global arrays in drivers/block/ll_rw_block.c */
+    for (loop = 0; loop < XLBLK_MAX; loop++) {
+	xlblk_blk_size[loop]      = xen_disk_info.disks[0].capacity;
+	xlblk_blksize_size[loop]  = 512;
+	xlblk_hardsect_size[loop] = 512;
+	xlblk_max_sectors[loop]   = 128;
+    }
+    xlblk_read_ahead  = 8;
+
+    blk_size[xlblk_major]      = xlblk_blk_size;
+    blksize_size[xlblk_major]  = xlblk_blksize_size;
+    hardsect_size[xlblk_major] = xlblk_hardsect_size;
+    read_ahead[xlblk_major]    = xlblk_read_ahead;
+    max_sectors[xlblk_major]   = xlblk_max_sectors;
+
+    blk_init_queue(BLK_DEFAULT_QUEUE(xlblk_major), do_xlblk_request);
+    /* XXX SMH: we don't leave req on queue => are happy for elevator
+    ** to reorder things including it. (main reason for this decision
+    ** is that it works while 'standard' case doesn't. Ho hum).
+    */
+    blk_queue_headactive(BLK_DEFAULT_QUEUE(xlblk_major), 0);
+
+    xlblk_ide_register_disk(0, xen_disk_info.disks[0].capacity);
+
+    printk(KERN_ALERT
+	   "XenoLinux Virtual Block Device Driver installed [device: %d]\n",
+	   xlblk_major);
+    return 0;
+
+ fail_irq:
+    free_irq(XLBLK_TX_IRQ, &xlblk_device);
+    free_irq(XLBLK_RX_IRQ, &xlblk_device);
+ fail:
+    if (blk_ring->btx_ring) kfree(blk_ring->btx_ring);
+    if (blk_ring->brx_ring) kfree(blk_ring->brx_ring);
+    return error;
+}
+
+/* Build and register the gendisk for virtual disk idx with the given
+ * capacity.  If the core allocations fail, nothing is registered. */
+void xlblk_ide_register_disk(int idx, unsigned long capacity)
+{
+    int units  = 2;                            /* from ide.h::MAX_DRIVES */
+    int minors = units * (1<<IDE_PARTN_BITS);
+    struct gendisk *gd;
+
+    /* plagiarized from ide-probe.c::init_gendisk */
+    gd = kmalloc (sizeof(struct gendisk), GFP_KERNEL);
+    if (gd == NULL) goto fail;
+    gd->sizes = kmalloc (minors * sizeof(int), GFP_KERNEL);
+    gd->part  = kmalloc (minors * sizeof(struct hd_struct), GFP_KERNEL);
+    if ((gd->sizes == NULL) || (gd->part == NULL)) goto fail;
+    memset(gd->sizes, 0, minors * sizeof(int));
+    memset(gd->part, 0, minors * sizeof(struct hd_struct));
+
+    gd->major        = xlblk_major;
+    gd->major_name   = XLBLK_MAJOR_NAME;
+    gd->minor_shift  = IDE_PARTN_BITS;
+    gd->max_p	     = 1<<IDE_PARTN_BITS;
+    gd->nr_real	     = units;
+    gd->real_devices = NULL;
+    gd->next	     = NULL;
+    gd->fops         = &xenolinux_block_fops;
+    gd->de_arr       = kmalloc (sizeof *gd->de_arr * units, GFP_KERNEL);
+    gd->flags	     = kmalloc (sizeof *gd->flags * units, GFP_KERNEL);
+    if (gd->de_arr)
+	memset (gd->de_arr, 0, sizeof *gd->de_arr * units);
+    if (gd->flags)
+	memset (gd->flags, 0, sizeof *gd->flags * units);
+
+    add_gendisk(gd);
+    xen_disk_info.disks[idx].gendisk = gd;
+    /* the size registered is whatever capacity the caller passed in */
+    register_disk(gd, MKDEV(xlblk_major, 0), 1<<IDE_PARTN_BITS,
+		  &xenolinux_block_fops, capacity);
+    return;
+ fail:
+    printk (KERN_ALERT "xlblk: out of memory registering disk %d\n", idx);
+    if (gd && gd->sizes) kfree(gd->sizes);
+    if (gd && gd->part)  kfree(gd->part);
+    if (gd)              kfree(gd);
+}
+
+
+
+/* Module unload: tear down the queue, detach this driver's per-minor
+ * tables from the block layer, release the IRQs and give up the major. */
+static void __exit xlblk_cleanup(void)
+{
+    /* CHANGE FOR MULTIQUEUE */
+    blk_cleanup_queue(BLK_DEFAULT_QUEUE(xlblk_major));
+
+    /*
+     * clean up global arrays.  the per-major entries point at this
+     * file's static arrays (see xlblk_init), so they must only be
+     * unhooked -- they were never kmalloc'ed and must not be kfree'd.
+     */
+    read_ahead[xlblk_major]    = 0;
+    blk_size[xlblk_major]      = NULL;
+    blksize_size[xlblk_major]  = NULL;
+    hardsect_size[xlblk_major] = NULL;
+    max_sectors[xlblk_major]   = NULL;
+
+    free_irq(XLBLK_TX_IRQ, &xlblk_device);
+    free_irq(XLBLK_RX_IRQ, &xlblk_device);
+
+    /* XXX: free each gendisk */
+    if (unregister_blkdev(xlblk_major, "block"))
+	printk(KERN_ALERT
+	       "XenoLinux Virtual Block Device Driver uninstalled w/ errs\n");
+    else
+	printk(KERN_ALERT
+	       "XenoLinux Virtual Block Device Driver uninstalled\n");
+    return;
+}
+
+
+#ifdef MODULE
+module_init(xlblk_init);
+module_exit(xlblk_cleanup);
+#endif
diff --git a/xenolinux-2.4.16-sparse/arch/xeno/drivers/block/xl_block_test.c b/xenolinux-2.4.16-sparse/arch/xeno/drivers/block/xl_block_test.c
new file mode 100644
index 0000000000..cab6d9a330
--- /dev/null
+++ b/xenolinux-2.4.16-sparse/arch/xeno/drivers/block/xl_block_test.c
@@ -0,0 +1,233 @@
+/******************************************************************************
+ * xl_block_test.c
+ * 
+ */
+#define EXPORT_SYMTAB
+
+#include <linux/config.h>
+#include <linux/module.h>
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/proc_fs.h>
+#include <linux/sched.h>
+#include <asm/uaccess.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+
+#include <asm/hypervisor-ifs/block.h>
+#include <asm/hypervisor-ifs/hypervisor-if.h>
+
+/******************************************************************/
+
+static struct proc_dir_entry *bdt;
+static blk_ring_entry_t meta;
+static char * data;
+
+/* The dump helpers are defined further down; declare them before use. */
+int proc_dump_block(char *, char **, off_t, int, int *, void *);
+int proc_dump_debug(char *, char **, off_t, int, int *, void *);
+
+/* /proc/bdt read handler: dispatches on the operation stored in meta by
+ * the last write; returns the helper's result, or -EINVAL if unknown. */
+static int proc_read_bdt(char *page, char **start, off_t off,
+			 int count, int *eof, void *data)
+{
+  switch (meta.operation)
+  {
+    case XEN_BLOCK_READ :
+    case XEN_BLOCK_WRITE :
+      return proc_dump_block(page, start, off, count, eof, data);
+    case XEN_BLOCK_DEBUG :
+      return proc_dump_debug(page, start, off, count, eof, data);
+    default :
+      printk(KERN_ALERT 
+	     "block device test error: unknown operation [%c]\n",
+	     meta.operation);
+      return -EINVAL;
+  }
+}
+
+/* Emit the NUL-terminated text in meta.buffer plus a trailing newline,
+ * copying straight into the proc page. Bounded by count and by
+ * meta.block_size so that no fixed-size stack buffer can overflow. */
+int proc_dump_debug(char *page, char **start, off_t off,
+		    int count, int *eof, void *data)
+{
+  int len;
+  int max = meta.block_size;
+
+  if (count <= 0)
+    return 0;
+  if (max > count - 1)
+    max = count - 1;	/* leave room for the newline */
+  for (len = 0; meta.buffer && len < max && meta.buffer[len]; len++)
+    page[len] = meta.buffer[len];
+  page[len] = '\n';
+  return len + 1;
+}
+
+/*
+ * Render the last read/write request: a header line describing it, then
+ * a hex dump of up to 100 bytes of meta.buffer (never more than
+ * meta.block_size). Returns the bytes put in page, capped at count.
+ */
+int proc_dump_block(char *page, char **start, off_t off,
+		    int count, int *eof, void *data)
+{
+  char dump[1024];
+  char temp[100];
+  int loop;
+  int limit = meta.block_size;
+  int len;
+
+  /* format straight into dump: the header alone can exceed 100 chars */
+  sprintf (dump, "Block Device Test\n\n%s  blk num: %ld 0x%lx;  size: %d 0x%x;  device: 0x%x\n",
+	   meta.operation == XEN_BLOCK_WRITE ? "write" : "read",
+	   meta.block_number, meta.block_number,
+	   meta.block_size, meta.block_size,
+	   meta.device);
+
+  /* never read past the end of the allocated buffer */
+  if (limit > 100)
+    limit = 100;
+
+  if (meta.buffer)
+  {
+    for (loop = 0; loop < limit; loop++)
+    {
+      if (loop % 8 == 0)
+      {
+	sprintf (temp, "[%2d] ", loop);
+	strcat(dump, temp);
+      }
+      else if (loop % 2 == 0)
+	strcat(dump, " ");
+      sprintf (temp, " 0x%02x", meta.buffer[loop] & 255);
+      strcat(dump, temp);
+      if ((loop + 1) % 8 == 0)
+	strcat(dump, "\n");
+    }
+    strcat(dump, "\n\n");
+  }
+
+  len = strlen(dump);
+  if (len > count)
+    len = count;	/* copy an explicit length; page need not be terminated */
+  memcpy (page, dump, len);
+  return len;
+}
+
+/*
+ * /proc/bdt write handler. Parses "<opcode> <block> <size> <device>" and
+ * submits the matching request to the hypervisor: r/w/d (and D) are sent
+ * synchronously, R and W asynchronously; d/D force a size of 10000.
+ * Returns count on success or a negative errno.
+ * The last parameter is deliberately not named "data": that name shadowed
+ * the file-scope transfer buffer, so the buffer was never recorded and a
+ * fresh allocation leaked on every write.
+ */
+int proc_write_bdt(struct file *file, const char *buffer,
+		   unsigned long count, void *priv)
+{
+  char *local;
+  char  opcode = '?';		/* rejected below if nothing is parsed */
+  int  block_number = 0;
+  int  block_size = 0;
+  int  device = 0;
+  int  mode;
+  int  ret;
+
+  local = kmalloc((count + 1) * sizeof(char), GFP_KERNEL);
+  if (local == NULL)
+    return -ENOMEM;
+
+  ret = -EFAULT;
+  if (copy_from_user(local, buffer, count))
+    goto out;
+  local[count] = '\0';
+  sscanf(local, "%c %i %i %i", 
+	 &opcode, &block_number, &block_size, &device);
+
+  ret = -EINVAL;
+  switch (opcode)
+  {
+    case 'r': case 'R': meta.operation = XEN_BLOCK_READ;  break;
+    case 'w': case 'W': meta.operation = XEN_BLOCK_WRITE; break;
+    case 'd': case 'D':
+      meta.operation = XEN_BLOCK_DEBUG;
+      block_size = 10000;
+      break;
+    default:
+      printk(KERN_ALERT 
+	     "block device test error: unknown opcode [%c]\n", opcode);
+      goto out;
+  }
+
+  /* only upper-case R and W request asynchronous completion */
+  mode = (opcode == 'R' || opcode == 'W') ? XEN_BLOCK_ASYNC : XEN_BLOCK_SYNC;
+
+  /* swap in a fresh, zeroed buffer; never leave meta.buffer dangling.
+   * NOTE(review): assumes no async request still uses the old buffer */
+  kfree(data);
+  data = kmalloc(block_size * sizeof(char), GFP_KERNEL);
+  meta.buffer = data;
+  ret = -ENOMEM;
+  if (data == NULL)
+    goto out;
+  memset(data, 0, block_size);
+
+  meta.block_number = block_number;
+  meta.block_size   = block_size;
+  meta.device       = device;
+
+  /* submit request */
+  hypervisor_request(0, meta.operation, meta.buffer, 
+		     meta.block_number, meta.block_size,
+		     meta.device, mode);
+  ret = count;
+
+ out:
+  kfree(local);
+  return ret;
+}
+			 
+
+/*
+ * Create /proc/bdt (mode 0644) and attach its read/write handlers.
+ * Returns 0 on success, or -ENOMEM when the proc entry cannot be created.
+ */
+static int __init init_module(void)
+{
+  struct proc_dir_entry *entry;
+
+  /* create proc entry */
+  bdt = entry = create_proc_entry("bdt", 0644, NULL);
+  if (!entry)
+    return -ENOMEM;
+
+  entry->data       = NULL;
+  entry->read_proc  = proc_read_bdt;
+  entry->write_proc = proc_write_bdt;
+  entry->owner      = THIS_MODULE;
+
+  /* start from an all-zero request descriptor */
+  memset(&meta, 0, sizeof(meta));
+
+  /* success */
+  printk(KERN_ALERT "XenoLinux Block Device Test installed\n");
+  return 0;
+}
+
+/* Module exit: remove /proc/bdt, then free the transfer buffer. */
+static void __exit cleanup_module(void)
+{
+  /* unhook the entry first so its handlers cannot run during teardown */
+  remove_proc_entry("bdt", NULL);
+  kfree(data);
+  printk(KERN_ALERT "XenoLinux Block Device Test uninstalled\n");
+}
+
+module_init(init_module);
+module_exit(cleanup_module);
diff --git a/xenolinux-2.4.16-sparse/arch/xeno/drivers/dom0/dom0_block.c b/xenolinux-2.4.16-sparse/arch/xeno/drivers/dom0/dom0_block.c
new file mode 100644
index 0000000000..97d4a65b78
--- /dev/null
+++ b/xenolinux-2.4.16-sparse/arch/xeno/drivers/dom0/dom0_block.c
@@ -0,0 +1,27 @@
+/*
+ * domain 0 block driver interface
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+
+/* Calls request_module("xl_block"), logs a message and returns 0. */
+static int __init init_module(void)
+{
+  /* NOTE(review): the result of request_module() is not checked */
+  request_module("xl_block");
+  printk("Successfully installed domain 0 block interface\n");
+  return 0;
+}
+
+/* Module exit: only logs; a void function must not return a value. */
+static void __exit cleanup_module(void)
+{
+  printk("Successfully de-installed domain-0 block interface\n");
+}
+
+module_init(init_module);
+module_exit(cleanup_module);
diff --git a/xenolinux-2.4.16-sparse/arch/xeno/kernel/i386_ksyms.c b/xenolinux-2.4.16-sparse/arch/xeno/kernel/i386_ksyms.c
index 12db77164b..a35ef1cc8a 100644
--- a/xenolinux-2.4.16-sparse/arch/xeno/kernel/i386_ksyms.c
+++ b/xenolinux-2.4.16-sparse/arch/xeno/kernel/i386_ksyms.c
@@ -42,7 +42,7 @@ extern struct drive_info_struct drive_info;
 EXPORT_SYMBOL(drive_info);
 #endif
 
-extern unsigned long get_cmos_time(void);
+//extern unsigned long get_cmos_time(void);
 
 /* platform dependent support */
 EXPORT_SYMBOL(boot_cpu_data);
@@ -58,7 +58,7 @@ EXPORT_SYMBOL(probe_irq_mask);
 EXPORT_SYMBOL(kernel_thread);
 EXPORT_SYMBOL(pm_idle);
 EXPORT_SYMBOL(pm_power_off);
-EXPORT_SYMBOL(get_cmos_time);
+//EXPORT_SYMBOL(get_cmos_time);
 EXPORT_SYMBOL(apm_info);
 
 #ifdef CONFIG_DEBUG_IOVIRT
diff --git a/xenolinux-2.4.16-sparse/arch/xeno/kernel/process.c b/xenolinux-2.4.16-sparse/arch/xeno/kernel/process.c
index 1c7f27176d..87c52056f6 100644
--- a/xenolinux-2.4.16-sparse/arch/xeno/kernel/process.c
+++ b/xenolinux-2.4.16-sparse/arch/xeno/kernel/process.c
@@ -86,7 +86,7 @@ void cpu_idle (void)
 
     while (1) {
         while (!current->need_resched)
-            HYPERVISOR_yield();
+            HYPERVISOR_do_sched_op(NULL);
         schedule();
         check_pgt_cache();
     }
diff --git a/xenolinux-2.4.16-sparse/arch/xeno/kernel/time.c b/xenolinux-2.4.16-sparse/arch/xeno/kernel/time.c
index c728eb15e6..4999af6642 100644
--- a/xenolinux-2.4.16-sparse/arch/xeno/kernel/time.c
+++ b/xenolinux-2.4.16-sparse/arch/xeno/kernel/time.c
@@ -1,3 +1,25 @@
+/* -*-  Mode:C; c-basic-offset:4; tab-width:4 -*-
+ ****************************************************************************
+ * (C) 2002 - Rolf Neugebauer - Intel Research Cambridge
+ ****************************************************************************
+ *
+ *        File: arch.xeno/time.c
+ *      Author: Rolf Neugebauer
+ *     Changes: 
+ *              
+ *        Date: Nov 2002
+ * 
+ * Environment: XenoLinux
+ * Description: Interface with Hypervisor to get correct notion of time
+ *              Currently supports Systemtime and WallClock time.
+ *
+ * (This bears hardly any resemblance to the Linux code, but the original
+ *  copyright notice is kept anyway. Ignore the comments in that notice.)
+ ****************************************************************************
+ * $Id: c-insert.c,v 1.7 2002/11/08 16:04:34 rn Exp $
+ ****************************************************************************
+ */
+
 /*
  *  linux/arch/i386/kernel/time.c
  *
@@ -30,19 +52,6 @@
  *	serialize accesses to xtime/lost_ticks).
  */
 
-#include <linux/errno.h>
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/param.h>
-#include <linux/string.h>
-#include <linux/mm.h>
-#include <linux/interrupt.h>
-#include <linux/time.h>
-#include <linux/delay.h>
-#include <linux/init.h>
-#include <linux/smp.h>
-
-#include <asm/io.h>
 #include <asm/smp.h>
 #include <asm/irq.h>
 #include <asm/msr.h>
@@ -51,115 +60,103 @@
 #include <asm/uaccess.h>
 #include <asm/processor.h>
 
-#include <linux/mc146818rtc.h>
-#include <linux/timex.h>
-#include <linux/config.h>
-
+#include <asm/div64.h>
 #include <asm/hypervisor.h>
 
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/time.h>
+#include <linux/init.h>
+#include <linux/smp.h>
 #include <linux/irq.h>
 
-
-unsigned long cpu_khz;	/* Detected as we calibrate the TSC */
-
-/* Cached *multiplier* to convert TSC counts to microseconds.
- * (see the equation below).
- * Equal to 2^32 * (1 / (clocks per usec) ).
- * Initialized in time_init.
- */
-unsigned long fast_gettimeoffset_quotient;
-
-extern rwlock_t xtime_lock;
-extern unsigned long wall_jiffies;
+#undef XENO_TIME_DEBUG	/* adds sanity checks and periodic printouts */
 
 spinlock_t rtc_lock = SPIN_LOCK_UNLOCKED;
+extern rwlock_t xtime_lock;
 
-static inline unsigned long ticks_to_secs(unsigned long long ticks)
-{
-    unsigned long lo, hi;
-    unsigned long little_ticks;
-
-    little_ticks = ticks /* XXX URK! XXX / 1000000ULL */;
-
-    __asm__ __volatile__ (
-        "mull %2"
-        : "=a" (lo), "=d" (hi)
-        : "rm" (fast_gettimeoffset_quotient), "0" (little_ticks) );
+unsigned long cpu_khz;	/* get this from Xen, used elsewhere */
+static spinlock_t hyp_stime_lock = SPIN_LOCK_UNLOCKED;
+static spinlock_t hyp_wctime_lock = SPIN_LOCK_UNLOCKED;
 
-    return(hi);
-}
+static u32 st_scale_f;
+static u32 st_scale_i;
+static u32 shadow_st_pcc;
+static s64 shadow_st;
 
-/* NB. Only 32 bits of ticks are considered here. */
-static inline unsigned long ticks_to_us(unsigned long ticks)
+/*
+ * System time.
+ * Although the rest of the Linux kernel doesn't know about this, we
+ * use it to extrapolate the passage of wallclock time.
+ * The values are read from the shared info page "atomically", using the
+ * cycle counter timestamp as the "version" number: the read is retried
+ * until the timestamp matches the shadow copy. Clashes should be rare.
+ */
+static inline long long get_s_time(void)
 {
-    unsigned long lo, hi;
+	unsigned long flags;
+    u32           delta_tsc, low, pcc;
+	u64           delta;
+	s64           now;
 
-    __asm__ __volatile__ (
-        "mull %2"
-        : "=a" (lo), "=d" (hi)
-        : "rm" (fast_gettimeoffset_quotient), "0" (ticks) );
+	spin_lock_irqsave(&hyp_stime_lock, flags);
 
-    return(hi);
-}
+	while ((pcc = HYPERVISOR_shared_info->st_timestamp) != shadow_st_pcc)
+	{
+		barrier();
+		shadow_st_pcc = pcc;
+		shadow_st     = HYPERVISOR_shared_info->system_time;
+		barrier();
+	}
 
-static inline unsigned long do_gettimeoffset(void)
-{
-#if 0
-    register unsigned long eax, edx;
+    now = shadow_st;
+    /* only use bottom 32bits of TSC. This should be sufficient */
+	rdtscl(low);
+    delta_tsc = low - pcc;
+	delta = ((u64)delta_tsc * st_scale_f);
+	delta >>= 32;
+	delta += ((u64)delta_tsc * st_scale_i);
 
-    /* Read the Time Stamp Counter */
+	spin_unlock_irqrestore(&hyp_stime_lock, flags);
 
-    rdtsc(eax,edx);
+    return now + delta; 
 
-    /* .. relative to previous jiffy (32 bits is enough) */
-    eax -= last_tsc_low;	/* tsc_low delta */
-
-    /*
-     * Time offset = (tsc_low delta) * fast_gettimeoffset_quotient
-     *             = (tsc_low delta) * (usecs_per_clock)
-     *             = (tsc_low delta) * (usecs_per_jiffy / clocks_per_jiffy)
-     *
-     * Using a mull instead of a divl saves up to 31 clock cycles
-     * in the critical path.
-     */
-    
-    edx = ticks_to_us(eax);
-
-    /* our adjusted time offset in microseconds */
-    return delay_at_last_interrupt + edx;
-#else
-    /*
-     * We should keep a 'last_tsc_low' thing which incorporates 
-     * delay_at_last_interrupt, adjusted in timer_interrupt after
-     * do_timer_interrupt. It would look at change in xtime, and
-     * make appropriate adjustment to a last_tsc variable.
-     * 
-     * We'd be affected by rounding error in ticks_per_usec, and by
-     * processor clock drift (which should be no more than in an
-     * external interrupt source anyhow). 
-     * 
-     * Perhaps a bit rough and ready, but never mind!
-     */
-    return 0;
-#endif
 }
+#define NOW()				((long long)get_s_time())
 
 /*
- * This version of gettimeofday has microsecond resolution
- * and better than microsecond precision on fast x86 machines with TSC.
+ * Wallclock time.
+ * Based on what the hypervisor tells us, extrapolated using system time.
+ * Again need to read a number of values from the shared page "atomically".
+ * this time using a version number.
  */
+static u32        shadow_wc_version=0;
+static long       shadow_tv_sec;
+static long       shadow_tv_usec;
+static long long  shadow_wc_timestamp;
 void do_gettimeofday(struct timeval *tv)
 {
-    unsigned long flags;
-    unsigned long usec, sec, lost;
-
-    read_lock_irqsave(&xtime_lock, flags);
-    usec = do_gettimeoffset();
-    lost = jiffies - wall_jiffies;
-    if ( lost != 0 ) usec += lost * (1000000 / HZ);
-    sec = xtime.tv_sec;
-    usec += xtime.tv_usec;
-    read_unlock_irqrestore(&xtime_lock, flags);
+	unsigned long flags;
+    long          usec, sec;
+	u32	          version;
+	u64           now;
+
+	spin_lock_irqsave(&hyp_wctime_lock, flags);
+
+	while ((version = HYPERVISOR_shared_info->wc_version)!= shadow_wc_version)
+	{
+		barrier();
+		shadow_wc_version   = version;
+		shadow_tv_sec       = HYPERVISOR_shared_info->tv_sec;
+		shadow_tv_usec      = HYPERVISOR_shared_info->tv_usec;
+		shadow_wc_timestamp = HYPERVISOR_shared_info->wc_timestamp;
+		barrier();
+	}
+
+	now   = NOW();
+	usec  = ((unsigned long)(now-shadow_wc_timestamp))/1000;
+	sec   = shadow_tv_sec;
+	usec += shadow_tv_usec;
 
     while ( usec >= 1000000 ) 
     {
@@ -169,10 +166,40 @@ void do_gettimeofday(struct timeval *tv)
 
     tv->tv_sec = sec;
     tv->tv_usec = usec;
+
+	spin_unlock_irqrestore(&hyp_wctime_lock, flags); /* lock taken above */
+
+#ifdef XENO_TIME_DEBUG
+	{
+		static long long old_now=0;
+		static long long wct=0, old_wct=0;
+
+		/* This debug code checks if time increase over two subsequent calls */
+		wct=(((long long)sec) * 1000000) + usec;
+		/* wall clock time going backwards */
+		if ((wct < old_wct) ) {	
+			printk("Urgh1: wc diff=%6ld, usec = %ld (0x%lX)\n",
+				   (long)(wct-old_wct), usec, usec);		
+			printk("       st diff=%lld cur st=0x%016llX old st=0x%016llX\n",
+				   now-old_now, now, old_now);
+		}
+
+		/* system time going backwards */
+		if (now<=old_now) {
+			printk("Urgh2: st diff=%lld cur st=0x%016llX old st=0x%016llX\n",
+				   now-old_now, now, old_now);
+		}
+		old_wct  = wct;
+		old_now  = now;
+	}
+#endif
+
 }
 
 void do_settimeofday(struct timeval *tv)
 {
+/* XXX RN: should do something special here for dom0 */
+#if 0
     write_lock_irq(&xtime_lock);
     /*
      * This is revolting. We need to set "xtime" correctly. However, the
@@ -195,29 +222,73 @@ void do_settimeofday(struct timeval *tv)
     time_maxerror = NTP_PHASE_LIMIT;
     time_esterror = NTP_PHASE_LIMIT;
     write_unlock_irq(&xtime_lock);
+#endif
 }
 
 
 /*
- * timer_interrupt() needs to keep up the real-time clock,
- * as well as call the "do_timer()" routine every clocktick
+ * Timer ISR. 
+ * Unlike normal Linux these don't come in at a fixed rate of HZ. 
+ * In here we work out how often it should have been called and then call
+ * the architecture independent part (do_timer()) the appropriate number of
+ * times. A bit of a nasty hack, to keep the "other" notion of wallclock time
+ * happy.
  */
-static inline void do_timer_interrupt(
-    int irq, void *dev_id, struct pt_regs *regs)
+static long long us_per_tick=1000000/HZ;
+static long long last_irq;
+static inline void do_timer_interrupt(int irq, void *dev_id,
+									  struct pt_regs *regs)
 {
-    do_timer(regs);
+	struct timeval tv;
+	long long time, delta;
+
+#ifdef XENO_TIME_DEBUG
+	static u32 foo_count = 0;
+	foo_count++;		
+	if (foo_count>= 10000) {
+		s64 n = NOW();
+		struct timeval tv;
+		do_gettimeofday(&tv);
+		printk("0x%08X%08X %ld:%ld\n",
+			   (u32)(n>>32), (u32)n, tv.tv_sec, tv.tv_usec);
+		foo_count = 0;
+	}
+#endif
+
+    /*
+     * The next bit really sucks:
+     * Linux not only uses do_gettimeofday() to keep a notion of
+     * wallclock time, but also maintains the xtime struct and jiffies.
+     * (Even worse, some userland code accesses this via the sys_time()
+     * system call.)
+     * Unfortunately, xtime is maintained in the architecture independent
+     * part of the timer ISR (./kernel/timer.c sic!). So, although we have
+     * a perfectly valid notion of wallclock time from the hypervisor, we
+     * fake missed timer interrupts here so that the arch independent part
+     * of the timer ISR updates jiffies for us *and*, once the bh gets run,
+     * updates xtime accordingly. Yuck!
+     */
+
+	/* work out the number of jiffies that have passed and update them */
+	do_gettimeofday(&tv);
+	time = (((long long)tv.tv_sec) * 1000000) + tv.tv_usec;
+	delta = time - last_irq;
+	if (delta <= 0) {
+		printk ("Timer ISR: Time went backwards: %lld\n", delta);
+		return;
+	}
+	while (delta >= us_per_tick) {
+		do_timer(regs);
+		delta    -= us_per_tick;
+		last_irq += us_per_tick;
+	}
+
 #if 0
     if (!user_mode(regs))
         x86_do_profile(regs->eip);
 #endif
 }
 
-
-/*
- * This is the same as the above, except we _also_ save the current
- * Time Stamp Counter value at the time of the timer interrupt, so that
- * we later on can estimate the time of day more exactly.
- */
 static void timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
 {
     write_lock(&xtime_lock);
@@ -234,56 +305,31 @@ static struct irqaction irq_timer = {
     NULL
 };
 
-
-unsigned long get_cmos_time(void)
-{
-    unsigned long secs = HYPERVISOR_shared_info->rtc_time;
-    unsigned long diff;
-
-    rdtscl(diff);
-    diff -= (unsigned long)HYPERVISOR_shared_info->rtc_timestamp;
-
-    secs += ticks_to_us(diff);
-
-    return(secs + ticks_to_secs(diff));
-}
-
-
-/* Return 2^32 * (1 / (TSC clocks per usec)) for do_fast_gettimeoffset(). */
-static unsigned long __init calibrate_tsc(void)
+void __init time_init(void)
 {
-    unsigned long quo, rem;
+    unsigned long long alarm;
+	u64	cpu_freq = HYPERVISOR_shared_info->cpu_freq;
+	u64 scale;
 
-    /* quotient == (1000 * 2^32) / ticks_per ms */
-    __asm__ __volatile__ (
-        "divl %2"
-        : "=a" (quo), "=d" (rem)
-        : "r" (HYPERVISOR_shared_info->ticks_per_ms), "0" (0), "1" (1000) );
+	do_get_fast_time = do_gettimeofday;
 
-    return(quo);
-}
+	cpu_khz = (u32)cpu_freq/1000;
+	printk("Xen reported: %lu.%03lu MHz processor.\n", 
+		   cpu_khz / 1000, cpu_khz % 1000);
 
-void __init time_init(void)
-{
-    unsigned long long alarm;
-	
-    fast_gettimeoffset_quotient = calibrate_tsc();
-    do_get_fast_time = do_gettimeofday;
+	/*
+     * calculate systemtime scaling factor
+	 * XXX RN: have to cast cpu_freq to u32 limits it to 4.29 GHz. 
+	 *     Get a better do_div!
+	 */
+	scale = 1000000000LL << 32;
+	do_div(scale,(u32)cpu_freq);
+	st_scale_f = scale & 0xffffffff;
+	st_scale_i = scale >> 32;
+	printk("System Time scale: %X %X\n",st_scale_i, st_scale_f);
 
-    /* report CPU clock rate in Hz.
-     * The formula is (10^6 * 2^32) / (2^32 * 1 / (clocks/us)) =
-     * clock/second. Our precision is about 100 ppm.
-     */
-    {	
-        unsigned long eax=0, edx=1000;
-        __asm__ __volatile__
-            ("divl %2"
-             :"=a" (cpu_khz), "=d" (edx)
-             :"r" (fast_gettimeoffset_quotient),
-             "0" (eax), "1" (edx));
-        printk("Detected %lu.%03lu MHz processor.\n", 
-               cpu_khz / 1000, cpu_khz % 1000);
-    }
+	do_gettimeofday(&xtime);
+	last_irq = (((long long)xtime.tv_sec) * 1000000) + xtime.tv_usec;
 
     setup_irq(TIMER_IRQ, &irq_timer);
 
@@ -292,13 +338,14 @@ void __init time_init(void)
      * 'domain' time. This means that clock sshould run at the correct
      * rate. For things like scheduling, it's not clear whether it
      * matters which sort of time we use.
+	 * XXX RN: unimplemented.
      */
+
     rdtscll(alarm);
+#if 0
     alarm += (1000/HZ)*HYPERVISOR_shared_info->ticks_per_ms;
     HYPERVISOR_shared_info->wall_timeout   = alarm;
     HYPERVISOR_shared_info->domain_timeout = ~0ULL;
+#endif
     clear_bit(_EVENT_TIMER, &HYPERVISOR_shared_info->events);
-
-    xtime.tv_sec = get_cmos_time();
-    xtime.tv_usec = 0;
 }