author    kaf24@labyrinth.cl.cam.ac.uk <kaf24@labyrinth.cl.cam.ac.uk>  2003-03-09 20:51:18 +0000
committer kaf24@labyrinth.cl.cam.ac.uk <kaf24@labyrinth.cl.cam.ac.uk>  2003-03-09 20:51:18 +0000
commit    6607af3a4627f1e17459cd798379136763c0c86a (patch)
tree      1ba73c518ec177d4b07f3a352647052dcf30902b
parent    34ba072b45d0fc781147b9299787cdb9161650c4 (diff)
download  xen-6607af3a4627f1e17459cd798379136763c0c86a.tar.gz
          xen-6607af3a4627f1e17459cd798379136763c0c86a.tar.bz2
          xen-6607af3a4627f1e17459cd798379136763c0c86a.zip
bitkeeper revision 1.115 (3e6ba94627SF_Dv66Al7guNkgaK_xg)
Many files: Add scatter/gather to the Xen blkdev interface. Our write speeds are now comparable with Linux. Also fixed a few bugs.
-rw-r--r--  xen/common/dom_mem_ops.c                                                  |   8
-rw-r--r--  xen/common/domain.c                                                       |   1
-rw-r--r--  xen/common/memory.c                                                       |   2
-rw-r--r--  xen/drivers/block/ll_rw_blk.c                                             |   2
-rw-r--r--  xen/drivers/block/xen_block.c                                             | 529
-rw-r--r--  xen/drivers/block/xen_segment.c                                           |  93
-rw-r--r--  xen/drivers/ide/ide-dma.c                                                 |  22
-rw-r--r--  xen/include/hypervisor-ifs/block.h                                        |  51
-rw-r--r--  xen/include/xeno/blkdev.h                                                 |  33
-rw-r--r--  xen/include/xeno/sched.h                                                  |   5
-rw-r--r--  xen/include/xeno/segment.h                                                |  19
-rw-r--r--  xen/net/dev.c                                                             |  35
-rw-r--r--  xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/block/xl_block.c           | 146
-rw-r--r--  xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/block/xl_block.h           |   2
-rw-r--r--  xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/block/xl_segment.c         |   2
-rw-r--r--  xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/block/xl_segment_proc.c    |   2
16 files changed, 590 insertions(+), 362 deletions(-)
diff --git a/xen/common/dom_mem_ops.c b/xen/common/dom_mem_ops.c
index 08a47652d3..e919311c5c 100644
--- a/xen/common/dom_mem_ops.c
+++ b/xen/common/dom_mem_ops.c
@@ -37,6 +37,7 @@ static long alloc_dom_mem(struct task_struct *p, balloon_def_op_t bop)
return -ENOMEM;
spin_lock_irqsave(&free_list_lock, flags);
+ spin_lock(&p->page_lock);
temp = free_list.next;
for ( i = 0; i < bop.size; i++ )
@@ -63,6 +64,7 @@ static long alloc_dom_mem(struct task_struct *p, balloon_def_op_t bop)
unmap_domain_mem(va);
}
+ spin_unlock(&p->page_lock);
spin_unlock_irqrestore(&free_list_lock, flags);
return bop.size;
@@ -78,7 +80,8 @@ static long free_dom_mem(struct task_struct *p, balloon_inf_op_t bop)
long rc = 0;
spin_lock_irqsave(&free_list_lock, flags);
-
+ spin_lock(&p->page_lock);
+
temp = free_list.next;
for ( i = 0; i < bop.size; i++ )
{
@@ -94,7 +97,7 @@ static long free_dom_mem(struct task_struct *p, balloon_inf_op_t bop)
pf = &frame_table[mpfn];
if ( (pf->type_count != 0) ||
- (pf->type_count != 0) ||
+ (pf->tot_count != 0) ||
(pf->flags != p->domain) )
{
DPRINTK("Bad page free for domain %d (%ld, %ld, %08lx)\n",
@@ -113,6 +116,7 @@ static long free_dom_mem(struct task_struct *p, balloon_inf_op_t bop)
}
out:
+ spin_unlock(&p->page_lock);
spin_unlock_irqrestore(&free_list_lock, flags);
return rc ? rc : bop.size;
diff --git a/xen/common/domain.c b/xen/common/domain.c
index 7ec09a8bb8..32bf8b7172 100644
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -46,6 +46,7 @@ struct task_struct *do_newdomain(unsigned int dom_id, unsigned int cpu)
p->processor = cpu;
spin_lock_init(&p->blk_ring_lock);
+ spin_lock_init(&p->page_lock);
p->shared_info = (void *)get_free_page(GFP_KERNEL);
memset(p->shared_info, 0, PAGE_SIZE);
diff --git a/xen/common/memory.c b/xen/common/memory.c
index 4a0304aaf8..4b0848ea9d 100644
--- a/xen/common/memory.c
+++ b/xen/common/memory.c
@@ -726,6 +726,7 @@ int do_process_page_updates(page_update_request_t *ureqs, int count)
err = 1;
/* Least significant bits of 'ptr' demux the operation type. */
+ spin_lock_irq(&current->page_lock);
switch ( req.ptr & (sizeof(l1_pgentry_t)-1) )
{
/*
@@ -799,6 +800,7 @@ int do_process_page_updates(page_update_request_t *ureqs, int count)
MEM_LOG("Invalid page update command %08lx", req.ptr);
break;
}
+ spin_unlock_irq(&current->page_lock);
if ( err )
{
diff --git a/xen/drivers/block/ll_rw_blk.c b/xen/drivers/block/ll_rw_blk.c
index 615b332c4b..870b5cdb85 100644
--- a/xen/drivers/block/ll_rw_blk.c
+++ b/xen/drivers/block/ll_rw_blk.c
@@ -1224,7 +1224,7 @@ void ll_rw_block(int rw, int nr, struct buffer_head * bhs[])
continue;
/* We have the buffer lock */
- atomic_inc(&bh->b_count);
+ /*atomic_inc(&bh->b_count);*/
switch(rw) {
case WRITE:
diff --git a/xen/drivers/block/xen_block.c b/xen/drivers/block/xen_block.c
index 81b4f4e75a..b72bb0640c 100644
--- a/xen/drivers/block/xen_block.c
+++ b/xen/drivers/block/xen_block.c
@@ -18,7 +18,7 @@
#include <xeno/interrupt.h>
#include <xeno/segment.h>
-#if 0
+#if 1
#define DPRINTK(_f, _a...) printk( _f , ## _a )
#else
#define DPRINTK(_f, _a...) ((void)0)
@@ -28,12 +28,30 @@
* These are rather arbitrary. They are fairly large because adjacent
* requests pulled from a communication ring are quite likely to end
* up being part of the same scatter/gather request at the disc.
- * It might be a good idea to add scatter/gather support explicitly to
- * the scatter/gather ring (eg. each request has an array of N pointers);
- * then these values would better reflect real costs at the disc.
+ *
+ * ** TRY INCREASING 'MAX_PENDING_REQS' IF WRITE SPEEDS SEEM TOO LOW **
+ * This will increase the chances of being able to write whole tracks.
+ * '64' should be enough to keep us competitive with Linux.
*/
-#define MAX_PENDING_REQS 32
-#define BATCH_PER_DOMAIN 8
+#define MAX_PENDING_REQS 64
+#define BATCH_PER_DOMAIN 16
+
+/*
+ * Each outstanding request which we've passed to the lower device layers
+ * has a 'pending_req' allocated to it. Each buffer_head that completes
+ * decrements the pendcnt towards zero. When it hits zero, the specified
+ * domain has a response queued for it, with the saved 'id' passed back.
+ *
+ * We can't allocate pending_req's in order, since they may complete out
+ * of order. We therefore maintain an allocation ring. This ring also
+ * indicates when enough work has been passed down -- at that point the
+ * allocation ring will be empty.
+ */
+static pending_req_t pending_reqs[MAX_PENDING_REQS];
+static unsigned char pending_ring[MAX_PENDING_REQS];
+static unsigned int pending_prod, pending_cons;
+static spinlock_t pend_prod_lock = SPIN_LOCK_UNLOCKED;
+#define PENDREQ_IDX_INC(_i) ((_i) = ((_i)+1) & (MAX_PENDING_REQS-1))
static kmem_cache_t *buffer_head_cachep;
static atomic_t nr_pending;
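The allocation-ring scheme described in the comment above can be illustrated with a small, self-contained C sketch (the names and the 64-entry size mirror the hunk above, but these helpers are illustrative assumptions, not the hypervisor's code): free slot indices are handed out at a consumer index and returned at a producer index, so slots can be recycled in whatever order requests complete.

```c
/* Minimal sketch of the pending-request allocation ring (assumed helpers). */
#include <stdio.h>

#define MAX_PENDING_REQS 64                       /* must be a power of two */
#define IDX_INC(i) ((i) = ((i) + 1) & (MAX_PENDING_REQS - 1))

static unsigned char pending_ring[MAX_PENDING_REQS]; /* holds free slot ids */
static unsigned int pending_prod, pending_cons;

static void ring_init(void)
{
    unsigned int i;
    /* Initially every slot index is free. */
    for ( i = 0; i < MAX_PENDING_REQS; i++ ) pending_ring[i] = i;
    pending_prod = pending_cons = 0;
}

static int alloc_slot(void)                        /* take a free slot id */
{
    int slot = pending_ring[pending_cons];
    IDX_INC(pending_cons);
    return slot;
}

static void free_slot(int slot)                    /* return a completed slot */
{
    pending_ring[pending_prod] = (unsigned char)slot;
    IDX_INC(pending_prod);
}

int main(void)
{
    ring_init();
    int a = alloc_slot(), b = alloc_slot();
    printf("allocated slots %d and %d\n", a, b);
    free_slot(b);                                  /* slots may free out of order */
    free_slot(a);
    return 0;
}
```

Because MAX_PENDING_REQS is a power of two, the indices wrap with a simple mask rather than a modulo, exactly as PENDREQ_IDX_INC does above.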
@@ -65,6 +83,18 @@ static kdev_t scsi_devs[NR_SCSI_DEVS] = {
MKDEV(SCSI_DISK0_MAJOR, 224), MKDEV(SCSI_DISK0_MAJOR, 240), /* sdo, sdp */
};
+static int __buffer_is_valid(struct task_struct *p,
+ unsigned long buffer,
+ unsigned short size,
+ int writeable_buffer);
+static void __lock_buffer(unsigned long buffer,
+ unsigned short size,
+ int writeable_buffer);
+static void unlock_buffer(struct task_struct *p,
+ unsigned long buffer,
+ unsigned short size,
+ int writeable_buffer);
+
static void io_schedule(unsigned long unused);
static int do_block_io_op_domain(struct task_struct *p, int max_to_do);
static void dispatch_rw_block_io(struct task_struct *p, int index);
@@ -73,8 +103,8 @@ static void dispatch_probe_seg(struct task_struct *p, int index);
static void dispatch_debug_block_io(struct task_struct *p, int index);
static void dispatch_create_segment(struct task_struct *p, int index);
static void dispatch_delete_segment(struct task_struct *p, int index);
-static void make_response(struct task_struct *p, void *id, int op,
- unsigned long st);
+static void make_response(struct task_struct *p, unsigned long id,
+ unsigned short op, unsigned long st);
/******************************************************************
@@ -165,28 +195,27 @@ static void maybe_trigger_io_schedule(void)
static void end_block_io_op(struct buffer_head *bh, int uptodate)
{
- struct pfn_info *page;
- unsigned long pfn;
+ unsigned long flags;
+ pending_req_t *pending_req = bh->pending_req;
- for ( pfn = virt_to_phys(bh->b_data) >> PAGE_SHIFT;
- pfn < ((virt_to_phys(bh->b_data) + bh->b_size + PAGE_SIZE - 1) >>
- PAGE_SHIFT);
- pfn++ )
+ unlock_buffer(pending_req->domain,
+ virt_to_phys(bh->b_data),
+ bh->b_size,
+ (pending_req->operation==READ));
+
+ if ( atomic_dec_and_test(&pending_req->pendcnt) )
{
- page = frame_table + pfn;
- if ( ((bh->b_state & (1 << BH_Read)) != 0) &&
- (put_page_type(page) == 0) )
- page->flags &= ~PG_type_mask;
- put_page_tot(page);
+ make_response(pending_req->domain, pending_req->id,
+ pending_req->operation, uptodate ? 0 : 1);
+ spin_lock_irqsave(&pend_prod_lock, flags);
+ pending_ring[pending_prod] = pending_req - pending_reqs;
+ PENDREQ_IDX_INC(pending_prod);
+ spin_unlock_irqrestore(&pend_prod_lock, flags);
+ atomic_dec(&nr_pending);
+ maybe_trigger_io_schedule();
}
- atomic_dec(&nr_pending);
- make_response(bh->b_xen_domain, bh->b_xen_id,
- XEN_BLOCK_READ, uptodate ? 0 : 1);
-
kmem_cache_free(buffer_head_cachep, bh);
-
- maybe_trigger_io_schedule();
}
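In the reworked end_block_io_op above, each completed buffer head decrements the request's pendcnt, and only the final decrement queues a response and recycles the pending_req slot. A minimal sketch of that last-one-out pattern, using C11 atomics in place of Xen's atomic_t (an assumption purely for illustration):

```c
#include <stdatomic.h>
#include <stdio.h>

typedef struct {
    atomic_int pendcnt;          /* outstanding buffer heads for this request */
    unsigned long id;            /* guest-supplied id to echo back */
} pending_req_t;

/* Called once per completed buffer head. */
static void complete_one(pending_req_t *req)
{
    /* fetch_sub returns the previous value: 1 means we were the last. */
    if ( atomic_fetch_sub(&req->pendcnt, 1) == 1 )
        printf("request %lu fully complete -> queue response\n", req->id);
}

int main(void)
{
    pending_req_t req;
    req.id = 42;
    atomic_init(&req.pendcnt, 3);          /* three physical segments issued */
    complete_one(&req);
    complete_one(&req);
    complete_one(&req);                    /* only this call queues the response */
    return 0;
}
```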
@@ -208,16 +237,105 @@ long do_block_io_op(void)
* DOWNWARD CALLS -- These interface with the block-device layer proper.
*/
-static int do_block_io_op_domain(struct task_struct* p, int max_to_do)
+static int __buffer_is_valid(struct task_struct *p,
+ unsigned long buffer,
+ unsigned short size,
+ int writeable_buffer)
+{
+ unsigned long pfn;
+ struct pfn_info *page;
+ int rc = 0;
+
+ /* A request may span multiple page frames. Each must be checked. */
+ for ( pfn = buffer >> PAGE_SHIFT;
+ pfn < ((buffer + size + PAGE_SIZE - 1) >> PAGE_SHIFT);
+ pfn++ )
+ {
+ /* Each frame must be within bounds of machine memory. */
+ if ( pfn >= max_page )
+ {
+ DPRINTK("pfn out of range: %08lx\n", pfn);
+ goto out;
+ }
+
+ page = frame_table + pfn;
+
+ /* Each frame must belong to the requesting domain. */
+ if ( (page->flags & PG_domain_mask) != p->domain )
+ {
+ DPRINTK("bad domain: expected %d, got %ld\n",
+ p->domain, page->flags & PG_domain_mask);
+ goto out;
+ }
+
+ /* If reading into the frame, the frame must be writeable. */
+ if ( writeable_buffer &&
+ ((page->flags & PG_type_mask) != PGT_writeable_page) )
+ {
+ DPRINTK("non-writeable page passed for block read\n");
+ goto out;
+ }
+ }
+
+ rc = 1;
+ out:
+ return rc;
+}
+
+static void __lock_buffer(unsigned long buffer,
+ unsigned short size,
+ int writeable_buffer)
+{
+ unsigned long pfn;
+ struct pfn_info *page;
+
+ for ( pfn = buffer >> PAGE_SHIFT;
+ pfn < ((buffer + size + PAGE_SIZE - 1) >> PAGE_SHIFT);
+ pfn++ )
+ {
+ page = frame_table + pfn;
+ if ( writeable_buffer ) get_page_type(page);
+ get_page_tot(page);
+ }
+}
+
+static void unlock_buffer(struct task_struct *p,
+ unsigned long buffer,
+ unsigned short size,
+ int writeable_buffer)
+{
+ unsigned long pfn, flags;
+ struct pfn_info *page;
+
+ spin_lock_irqsave(&p->page_lock, flags);
+ for ( pfn = buffer >> PAGE_SHIFT;
+ pfn < ((buffer + size + PAGE_SIZE - 1) >> PAGE_SHIFT);
+ pfn++ )
+ {
+ page = frame_table + pfn;
+ if ( writeable_buffer && (put_page_type(page) == 0) )
+ page->flags &= ~PG_type_mask;
+ put_page_tot(page);
+ }
+ spin_unlock_irqrestore(&p->page_lock, flags);
+}
+
+static int do_block_io_op_domain(struct task_struct *p, int max_to_do)
{
blk_ring_t *blk_ring = p->blk_ring_base;
int i, more_to_do = 0;
+ /*
+ * Take items off the comms ring, taking care not to catch up
+ * with the response-producer index.
+ */
for ( i = p->blk_req_cons;
- i != blk_ring->req_prod;
+ (i != blk_ring->req_prod) &&
+ (((p->blk_resp_prod-i) & (BLK_RING_SIZE-1)) != 1);
i = BLK_RING_INC(i) )
{
- if ( max_to_do-- == 0 )
+ if ( (max_to_do-- == 0) ||
+ (atomic_read(&nr_pending) == MAX_PENDING_REQS) )
{
more_to_do = 1;
break;
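The new loop condition in the hunk above implements the comment's rule of not catching up with the response-producer index: requests and responses share the same ring slots, so the request consumer must stop before it wraps all the way around onto the domain's private response producer. A tiny sketch of the wrap-around test, assuming the power-of-two BLK_RING_SIZE from the interface header:

```c
#include <stdio.h>

#define BLK_RING_SIZE 64   /* power of two, as required by BLK_RING_INC() */

/* Returns non-zero if the consumer at 'cons' may safely take one more
 * request without wrapping onto the response producer at 'resp_prod'. */
static int may_consume(unsigned int cons, unsigned int resp_prod)
{
    return ((resp_prod - cons) & (BLK_RING_SIZE - 1)) != 1;
}

int main(void)
{
    /* Consumer about to wrap onto the response producer: must stop. */
    printf("%d\n", may_consume(5, 6));   /* prints 0 */
    /* Plenty of space: may continue. */
    printf("%d\n", may_consume(5, 40));  /* prints 1 */
    return 0;
}
```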
@@ -251,8 +369,11 @@ static int do_block_io_op_domain(struct task_struct* p, int max_to_do)
break;
default:
- panic("error: unknown block io operation [%d]\n",
- blk_ring->ring[i].req.operation);
+ DPRINTK("error: unknown block io operation [%d]\n",
+ blk_ring->ring[i].req.operation);
+ make_response(p, blk_ring->ring[i].req.id,
+ blk_ring->ring[i].req.operation, 1);
+ break;
}
}
@@ -268,23 +389,38 @@ static void dispatch_debug_block_io(struct task_struct *p, int index)
static void dispatch_create_segment(struct task_struct *p, int index)
{
blk_ring_t *blk_ring = p->blk_ring_base;
+ unsigned long flags, buffer;
xv_disk_t *xvd;
int result;
- if (p->domain != 0)
+ if ( p->domain != 0 )
{
DPRINTK("dispatch_create_segment called by dom%d\n", p->domain);
- make_response(p, blk_ring->ring[index].req.id,
- XEN_BLOCK_SEG_CREATE, 1);
- return;
+ result = 1;
+ goto out;
+ }
+
+ buffer = blk_ring->ring[index].req.buffer_and_sects[0] & ~0x1FF;
+
+ spin_lock_irqsave(&p->page_lock, flags);
+ if ( !__buffer_is_valid(p, buffer, sizeof(xv_disk_t), 1) )
+ {
+ DPRINTK("Bad buffer in dispatch_create_segment\n");
+ spin_unlock_irqrestore(&p->page_lock, flags);
+ result = 1;
+ goto out;
}
+ __lock_buffer(buffer, sizeof(xv_disk_t), 1);
+ spin_unlock_irqrestore(&p->page_lock, flags);
- xvd = phys_to_virt((unsigned long)blk_ring->ring[index].req.buffer);
+ xvd = phys_to_virt(buffer);
result = xen_segment_create(xvd);
+ unlock_buffer(p, buffer, sizeof(xv_disk_t), 1);
+
+ out:
make_response(p, blk_ring->ring[index].req.id,
XEN_BLOCK_SEG_CREATE, result);
- return;
}
static void dispatch_delete_segment(struct task_struct *p, int index)
@@ -299,13 +435,30 @@ static void dispatch_probe_blk(struct task_struct *p, int index)
blk_ring_t *blk_ring = p->blk_ring_base;
xen_disk_info_t *xdi;
+ unsigned long flags, buffer;
+ int rc = 0;
+
+ buffer = blk_ring->ring[index].req.buffer_and_sects[0] & ~0x1FF;
- xdi = phys_to_virt((unsigned long)blk_ring->ring[index].req.buffer);
+ spin_lock_irqsave(&p->page_lock, flags);
+ if ( !__buffer_is_valid(p, buffer, sizeof(xen_disk_info_t), 1) )
+ {
+ DPRINTK("Bad buffer in dispatch_probe_blk\n");
+ spin_unlock_irqrestore(&p->page_lock, flags);
+ rc = 1;
+ goto out;
+ }
+ __lock_buffer(buffer, sizeof(xen_disk_info_t), 1);
+ spin_unlock_irqrestore(&p->page_lock, flags);
+ xdi = phys_to_virt(buffer);
ide_probe_devices(xdi);
scsi_probe_devices(xdi);
- make_response(p, blk_ring->ring[index].req.id, XEN_BLOCK_PROBE_BLK, 0);
+ unlock_buffer(p, buffer, sizeof(xen_disk_info_t), 1);
+
+ out:
+ make_response(p, blk_ring->ring[index].req.id, XEN_BLOCK_PROBE_BLK, rc);
}
static void dispatch_probe_seg(struct task_struct *p, int index)
@@ -313,175 +466,147 @@ static void dispatch_probe_seg(struct task_struct *p, int index)
extern void xen_segment_probe(xen_disk_info_t *xdi);
blk_ring_t *blk_ring = p->blk_ring_base;
xen_disk_info_t *xdi;
+ unsigned long flags, buffer;
+ int rc = 0;
+
+ buffer = blk_ring->ring[index].req.buffer_and_sects[0] & ~0x1FF;
+
+ spin_lock_irqsave(&p->page_lock, flags);
+ if ( !__buffer_is_valid(p, buffer, sizeof(xen_disk_info_t), 1) )
+ {
+ DPRINTK("Bad buffer in dispatch_probe_seg\n");
+ spin_unlock_irqrestore(&p->page_lock, flags);
+ rc = 1;
+ goto out;
+ }
+ __lock_buffer(buffer, sizeof(xen_disk_info_t), 1);
+ spin_unlock_irqrestore(&p->page_lock, flags);
- xdi = phys_to_virt((unsigned long)blk_ring->ring[index].req.buffer);
+ xdi = phys_to_virt(buffer);
xen_segment_probe(xdi);
- make_response(p, blk_ring->ring[index].req.id, XEN_BLOCK_PROBE_SEG, 0);
+ unlock_buffer(p, buffer, sizeof(xen_disk_info_t), 1);
+
+ out:
+ make_response(p, blk_ring->ring[index].req.id, XEN_BLOCK_PROBE_SEG, rc);
}
static void dispatch_rw_block_io(struct task_struct *p, int index)
{
extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]);
blk_ring_t *blk_ring = p->blk_ring_base;
+ blk_ring_req_entry_t *req = &blk_ring->ring[index].req;
struct buffer_head *bh;
- int operation;
- unsigned short size;
- unsigned long block_number = 0L;
- unsigned long sector_number = 0L;
- unsigned long buffer, pfn;
- struct pfn_info *page;
- int s, xen_device, phys_device = 0;
+ int operation = (req->operation == XEN_BLOCK_WRITE) ? WRITE : READ;
+ unsigned short nr_sects;
+ unsigned long buffer, flags;
+ int i, tot_sects;
+ pending_req_t *pending_req;
- operation = (blk_ring->ring[index].req.operation == XEN_BLOCK_WRITE) ?
- WRITE : READ;
+ /* We map virtual scatter/gather segments to physical segments. */
+ int new_segs, nr_psegs = 0;
+ phys_seg_t phys_seg[MAX_BLK_SEGS * 2];
- /* Sectors are 512 bytes. Make sure request size is a multiple. */
- size = blk_ring->ring[index].req.block_size;
- if ( (size == 0) || (size & (0x200 - 1)) != 0 )
- {
- DPRINTK("dodgy block size: %d\n",
- blk_ring->ring[index].req.block_size);
- goto bad_descriptor;
- }
+ spin_lock_irqsave(&p->page_lock, flags);
- /* Buffer address should be sector aligned. */
- buffer = (unsigned long)blk_ring->ring[index].req.buffer;
- if ( (buffer & (0x200 - 1)) != 0 )
+ /* Check that number of segments is sane. */
+ if ( (req->nr_segments == 0) || (req->nr_segments > MAX_BLK_SEGS) )
{
- DPRINTK("unaligned buffer %08lx\n", buffer);
+ DPRINTK("Bad number of segments in request (%d)\n", req->nr_segments);
goto bad_descriptor;
}
- /* A request may span multiple page frames. Each must be checked. */
- for ( pfn = buffer >> PAGE_SHIFT;
- pfn < ((buffer + size + PAGE_SIZE - 1) >> PAGE_SHIFT);
- pfn++ )
+ /*
+ * Check each address/size pair is sane, and convert into a
+ * physical device and block offset. Note that if the offset and size
+ * crosses a virtual extent boundary, we may end up with more
+ * physical scatter/gather segments than virtual segments.
+ */
+ for ( i = tot_sects = 0; i < req->nr_segments; i++, tot_sects += nr_sects )
{
- /* Each frame must be within bounds of machine memory. */
- if ( pfn >= max_page )
+ buffer = req->buffer_and_sects[i] & ~0x1FF;
+ nr_sects = req->buffer_and_sects[i] & 0x1FF;
+
+ if ( nr_sects == 0 )
{
- DPRINTK("pfn out of range: %08lx\n", pfn);
- goto bad_descriptor_free_frames;
+ DPRINTK("zero-sized data request\n");
+ goto bad_descriptor;
}
- page = frame_table + pfn;
+ if ( !__buffer_is_valid(p, buffer, nr_sects<<9, (operation==READ)) )
+ goto bad_descriptor;
- /* Each frame must belong to the requesting domain. */
- if ( (page->flags & PG_domain_mask) != p->domain )
+ /* Get the physical device and block index. */
+ if ( (req->device & XENDEV_TYPE_MASK) == XENDEV_VIRTUAL )
{
- DPRINTK("bad domain: expected %d, got %ld\n",
- p->domain, page->flags & PG_domain_mask);
- goto bad_descriptor_free_frames;
+ new_segs = xen_segment_map_request(
+ &phys_seg[nr_psegs], p, operation,
+ req->device,
+ req->sector_number + tot_sects,
+ buffer, nr_sects);
+ if ( new_segs <= 0 ) goto bad_descriptor;
}
-
- /* If reading into the frame, the frame must be writeable. */
- if ( operation == READ )
+ else
{
- if ( (page->flags & PG_type_mask) != PGT_writeable_page )
- {
- DPRINTK("non-writeable page passed for block read\n");
- goto bad_descriptor_free_frames;
- }
- get_page_type(page);
+ phys_seg[nr_psegs].dev = xendev_to_physdev(req->device);
+ phys_seg[nr_psegs].sector_number = req->sector_number + tot_sects;
+ phys_seg[nr_psegs].buffer = buffer;
+ phys_seg[nr_psegs].nr_sects = nr_sects;
+ if ( phys_seg[nr_psegs].dev == 0 ) goto bad_descriptor;
+ new_segs = 1;
}
-
- /* Xen holds a frame reference until the operation is complete. */
- get_page_tot(page);
+
+ nr_psegs += new_segs;
+ if ( nr_psegs >= (MAX_BLK_SEGS*2) ) BUG();
}
- atomic_inc(&nr_pending);
- bh = kmem_cache_alloc(buffer_head_cachep, GFP_KERNEL);
- if ( bh == NULL ) panic("bh is null\n");
-
- /* set just the important bits of the buffer header */
- memset (bh, 0, sizeof (struct buffer_head));
-
- xen_device = blk_ring->ring[index].req.device;
+ /* Lock pages associated with each buffer head. */
+ for ( i = 0; i < nr_psegs; i++ )
+ __lock_buffer(phys_seg[i].buffer, phys_seg[i].nr_sects<<9,
+ (operation==READ));
+ spin_unlock_irqrestore(&p->page_lock, flags);
- again:
- switch ( (xen_device & XENDEV_TYPE_MASK) )
+ atomic_inc(&nr_pending);
+ pending_req = pending_reqs + pending_ring[pending_cons];
+ PENDREQ_IDX_INC(pending_cons);
+ pending_req->domain = p;
+ pending_req->id = req->id;
+ pending_req->operation = operation;
+ atomic_set(&pending_req->pendcnt, nr_psegs);
+
+ /* Now we pass each segment down to the real blkdev layer. */
+ for ( i = 0; i < nr_psegs; i++ )
{
- case XENDEV_IDE:
- xen_device &= XENDEV_IDX_MASK;
- if ( xen_device >= NR_IDE_DEVS )
- {
- DPRINTK("IDE device number out of range %d\n", xen_device);
- goto bad_descriptor_free_frames;
- }
- phys_device = ide_devs[xen_device];
- block_number = blk_ring->ring[index].req.block_number;
- sector_number = blk_ring->ring[index].req.sector_number;
- break;
-
- case XENDEV_SCSI:
- xen_device &= XENDEV_IDX_MASK;
- if ( xen_device >= NR_SCSI_DEVS )
+ bh = kmem_cache_alloc(buffer_head_cachep, GFP_KERNEL);
+ if ( bh == NULL ) panic("bh is null\n");
+ memset (bh, 0, sizeof (struct buffer_head));
+
+ bh->b_size = phys_seg[i].nr_sects << 9;
+ bh->b_dev = phys_seg[i].dev;
+ bh->b_rsector = phys_seg[i].sector_number;
+ bh->b_data = phys_to_virt(phys_seg[i].buffer);
+ bh->b_end_io = end_block_io_op;
+ bh->pending_req = pending_req;
+
+ if ( operation == WRITE )
{
- DPRINTK("SCSI device number out of range %d\n", xen_device);
- goto bad_descriptor_free_frames;
- }
- phys_device = scsi_devs[xen_device];
- block_number = blk_ring->ring[index].req.block_number;
- sector_number = blk_ring->ring[index].req.sector_number;
- break;
-
- case XENDEV_VIRTUAL:
- xen_device &= XENDEV_IDX_MASK;
- s = xen_segment_map_request(
- &xen_device, &block_number, &sector_number,
- p, operation, xen_device,
- blk_ring->ring[index].req.block_number,
- blk_ring->ring[index].req.sector_number);
- if ( s != 0 )
+ bh->b_state = (1 << BH_JBD) | (1 << BH_Mapped) | (1 << BH_Req) |
+ (1 << BH_Dirty) | (1 << BH_Uptodate) | (1 << BH_Write);
+ }
+ else
{
- DPRINTK("xen_seg_map_request status: %d\n", s);
- goto bad_descriptor_free_frames;
+ bh->b_state = (1 << BH_Mapped) | (1 << BH_Read);
}
- goto again; /* Loop round to convert the virt IDE/SCSI identifier. */
- default:
- DPRINTK("dispatch_rw_block_io: unknown device %d\n", xen_device);
- goto bad_descriptor_free_frames;
- }
-
- bh->b_blocknr = block_number;
- bh->b_size = size;
- bh->b_dev = phys_device;
- bh->b_rsector = sector_number;
- bh->b_data = phys_to_virt(buffer);
- bh->b_count.counter = 1;
- bh->b_end_io = end_block_io_op;
-
- /* Save meta data about request. */
- bh->b_xen_domain = p;
- bh->b_xen_id = blk_ring->ring[index].req.id;
-
- if ( operation == WRITE )
- {
- bh->b_state = (1 << BH_JBD) | (1 << BH_Mapped) | (1 << BH_Req) |
- (1 << BH_Dirty) | (1 << BH_Uptodate) | (1 << BH_Write);
- }
- else
- {
- bh->b_state = (1 << BH_Mapped) | (1 << BH_Read);
+ /* Dispatch a single request. We'll flush it to disc later. */
+ ll_rw_block(operation, 1, &bh);
}
- /* Dispatch a single request. We'll flush it to disc later. */
- ll_rw_block(operation, 1, &bh);
return;
- bad_descriptor_free_frames:
- while ( pfn > (buffer >> PAGE_SHIFT) )
- {
- page = frame_table + --pfn;
- put_page_tot(page);
- if ( operation == READ ) put_page_type(page);
- }
-
- bad_descriptor:
- DPRINTK("dispatch rw blockio bad descriptor\n");
- make_response(p, blk_ring->ring[index].req.id, XEN_BLOCK_READ, 1);
+ bad_descriptor:
+ spin_unlock_irqrestore(&p->page_lock, flags);
+ make_response(p, req->id, req->operation, 1);
}
@@ -490,8 +615,38 @@ static void dispatch_rw_block_io(struct task_struct *p, int index)
* MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING
*/
-static void make_response(struct task_struct *p, void *id,
- int op, unsigned long st)
+kdev_t xendev_to_physdev(unsigned short xendev)
+{
+ switch ( (xendev & XENDEV_TYPE_MASK) )
+ {
+ case XENDEV_IDE:
+ xendev &= XENDEV_IDX_MASK;
+ if ( xendev >= NR_IDE_DEVS )
+ {
+ DPRINTK("IDE device number out of range %d\n", xendev);
+ goto fail;
+ }
+ return ide_devs[xendev];
+
+ case XENDEV_SCSI:
+ xendev &= XENDEV_IDX_MASK;
+ if ( xendev >= NR_SCSI_DEVS )
+ {
+ DPRINTK("SCSI device number out of range %d\n", xendev);
+ goto fail;
+ }
+ return scsi_devs[xendev];
+
+ default:
+ DPRINTK("xendev_to_physdev: unknown device %d\n", xendev);
+ }
+
+ fail:
+ return (kdev_t)0;
+}
+
+static void make_response(struct task_struct *p, unsigned long id,
+ unsigned short op, unsigned long st)
{
unsigned long cpu_mask, flags;
int position;
@@ -500,11 +655,11 @@ static void make_response(struct task_struct *p, void *id,
/* Place on the response ring for the relevant domain. */
spin_lock_irqsave(&p->blk_ring_lock, flags);
blk_ring = p->blk_ring_base;
- position = blk_ring->resp_prod;
+ position = p->blk_resp_prod;
blk_ring->ring[position].resp.id = id;
blk_ring->ring[position].resp.operation = op;
blk_ring->ring[position].resp.status = st;
- blk_ring->resp_prod = BLK_RING_INC(position);
+ p->blk_resp_prod = blk_ring->resp_prod = BLK_RING_INC(position);
spin_unlock_irqrestore(&p->blk_ring_lock, flags);
/* Kick the relevant domain. */
@@ -517,18 +672,22 @@ static void dump_blockq(u_char key, void *dev_id, struct pt_regs *regs)
struct task_struct *p;
blk_ring_t *blk_ring ;
- printk("Dumping block queue stats: nr_pending = %d\n",
- atomic_read(&nr_pending));
+ printk("Dumping block queue stats: nr_pending = %d (prod=%d,cons=%d)\n",
+ atomic_read(&nr_pending), pending_prod, pending_cons);
p = current->next_task;
do
{
- printk (KERN_ALERT "Domain: %d\n", p->domain);
- blk_ring = p->blk_ring_base;
-
- printk(" req_prod:%d, resp_prod:%d, req_cons:%d\n",
- blk_ring->req_prod, blk_ring->resp_prod, p->blk_req_cons);
-
+ if ( !is_idle_task(p) )
+ {
+ printk("Domain: %d\n", p->domain);
+ blk_ring = p->blk_ring_base;
+
+ printk(" req_prod:%d, req_cons:%d resp_prod:%d/%d on_list=%d\n",
+ blk_ring->req_prod, p->blk_req_cons,
+ blk_ring->resp_prod, p->blk_resp_prod,
+ __on_blkdev_list(p));
+ }
p = p->next_task;
} while (p != current);
}
@@ -545,7 +704,8 @@ void init_blkdev_info(struct task_struct *p)
memset(p->segment_list, 0, sizeof(p->segment_list));
p->segment_count = 0;
- xen_refresh_segment_list(p); /* get any previously created segments */
+ /* Get any previously created segments. */
+ xen_refresh_segment_list(p);
}
/* End-of-day teardown for a domain. XXX Outstanding requests? */
@@ -558,7 +718,12 @@ void destroy_blkdev_info(struct task_struct *p)
void initialize_block_io ()
{
+ int i;
+
atomic_set(&nr_pending, 0);
+ pending_prod = pending_cons = 0;
+ memset(pending_reqs, 0, sizeof(pending_reqs));
+ for ( i = 0; i < MAX_PENDING_REQS; i++ ) pending_ring[i] = i;
spin_lock_init(&io_schedule_list_lock);
INIT_LIST_HEAD(&io_schedule_list);
diff --git a/xen/drivers/block/xen_segment.c b/xen/drivers/block/xen_segment.c
index 62fa6a3ea6..e644aad115 100644
--- a/xen/drivers/block/xen_segment.c
+++ b/xen/drivers/block/xen_segment.c
@@ -23,70 +23,73 @@ segment_t xsegments[XEN_MAX_SEGMENTS];
* xen_device must be a valid device.
*/
+/*
+ * NB. All offsets and sizes here are in sector units.
+ * eg. 'size == 1' means an actual size of 512 bytes.
+ */
int xen_segment_map_request(
- int *phys_device, /* out */
- unsigned long *block_number, /* out */
- unsigned long *sector_number, /* out */
- struct task_struct *domain,
- int operation,
- int segment_number,
- int xen_block_number,
- int xen_sector_number)
+ phys_seg_t *pseg, struct task_struct *p, int operation,
+ unsigned short segment_number,
+ unsigned long sect_nr, unsigned long buffer, unsigned short nr_sects)
{
segment_t *seg;
- int sum;
- int loop;
+ extent_t *ext;
+ int sum, i;
- if ( segment_number >= XEN_MAX_SEGMENTS )
- {
- /* No VHD. */
- return 1;
- }
+ if ( segment_number >= XEN_MAX_SEGMENTS ) goto fail;
- seg = domain->segment_list[segment_number];
-
- if (seg == NULL)
- {
- /* oops. no vhd exists! */
- return 1;
- }
+ seg = p->segment_list[segment_number];
+ if ( seg == NULL ) goto fail;
/* check domain permissions */
- if (seg->domain != domain->domain)
- {
- /* domain doesn't own segment */
- return 2;
- }
+ if ( seg->domain != p->domain ) goto fail;
/* check rw access */
if ((operation == WRITE && seg->mode != XEN_SEGMENT_RW) ||
(operation == READ && seg->mode == XEN_SEGMENT_UNUSED))
- {
- /* access violation */
- return 3;
- }
+ goto fail;
/* find extent, check size */
sum = 0;
- loop = 0;
- while (loop < seg->num_extents && sum <= xen_block_number)
+ i = 0;
+ ext = seg->extents;
+ while ( (i < seg->num_extents) && ((sum + ext->size) <= sect_nr) )
{
- sum += seg->extents[loop++].size;
+ sum += ext->size;
+ ext++; i++;
}
- sum -= seg->extents[--loop].size;
- if (sum + seg->extents[loop].size <= xen_block_number)
- {
- /* tried to read past the end of the segment */
- return 4;
- }
- *block_number = xen_block_number - sum + seg->extents[loop].offset;
- *sector_number = xen_sector_number - sum + seg->extents[loop].offset;;
+ if ( (sum + ext->size) <= sect_nr ) goto fail;
- /* This actually needs to be passed thru one more indirection :-) */
- *phys_device = seg->extents[loop].disk;
+ pseg->sector_number = sect_nr + ext->offset - sum;
+ pseg->buffer = buffer;
+ pseg->nr_sects = nr_sects;
+ pseg->dev = xendev_to_physdev(ext->disk);
+ if ( pseg->dev == 0 ) goto fail;
- return 0;
+ /* We're finished if the virtual extent didn't overrun the phys extent. */
+ if ( (sum + ext->size) >= (sect_nr + nr_sects) )
+ return 1; /* Just one more physical extent. */
+
+ /* Hmmm... make sure there's another extent to overrun onto! */
+ if ( (i+1) == seg->num_extents ) goto fail;
+
+ pseg[1].nr_sects = (sect_nr + nr_sects) - (sum + ext->size);
+ pseg[0].nr_sects = sum + ext->size - sect_nr;
+ pseg[1].buffer = buffer + (pseg->nr_sects << 9);
+ pseg[1].sector_number = ext[1].offset;
+ pseg[1].dev = xendev_to_physdev(ext[1].disk);
+ if ( pseg[1].dev == 0 ) goto fail;
+
+ /* We don't allow overrun onto a third physical extent. */
+ if ( (sum + ext[0].size + ext[1].size) <
+ (pseg[1].sector_number + pseg[1].nr_sects) )
+ goto fail;
+
+ return 2; /* We overran onto a second physical extent. */
+
+ fail:
+ return -1;
}
/*
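The rewritten xen_segment_map_request above works in sector units and may split one virtual segment into at most two physical segments when it straddles an extent boundary. A compact, self-contained sketch of that mapping (the structures and names here are simplified assumptions, not the actual Xen types):

```c
#include <stdio.h>

typedef struct { unsigned long offset, size; } extent_t;   /* in sectors */
typedef struct { unsigned long sector, nr_sects; } pseg_t;  /* physical segment */

/* Map 'nr_sects' starting at virtual sector 'sect_nr' onto at most two
 * physical segments.  Returns the number of segments, or -1 on error. */
static int map_request(pseg_t *pseg, const extent_t *ext, int num_extents,
                       unsigned long sect_nr, unsigned long nr_sects)
{
    unsigned long sum = 0;
    int i = 0;

    /* Find the extent containing the starting sector. */
    while ( (i < num_extents) && ((sum + ext[i].size) <= sect_nr) )
        sum += ext[i++].size;
    if ( i == num_extents ) return -1;              /* past end of segment */

    pseg[0].sector   = sect_nr - sum + ext[i].offset;
    pseg[0].nr_sects = nr_sects;

    /* Fits entirely within this extent? */
    if ( (sum + ext[i].size) >= (sect_nr + nr_sects) ) return 1;

    /* Otherwise spill the remainder onto the next extent (at most one). */
    if ( (i + 1) == num_extents ) return -1;
    pseg[0].nr_sects = sum + ext[i].size - sect_nr;
    pseg[1].nr_sects = nr_sects - pseg[0].nr_sects;
    pseg[1].sector   = ext[i + 1].offset;
    if ( pseg[1].nr_sects > ext[i + 1].size ) return -1;  /* no third extent */
    return 2;
}

int main(void)
{
    extent_t ext[2] = { { 1000, 100 }, { 5000, 100 } };   /* two 100-sector extents */
    pseg_t ps[2];
    int n = map_request(ps, ext, 2, 90, 20);              /* straddles the boundary */
    printf("%d segments: (%lu,%lu) (%lu,%lu)\n", n,
           ps[0].sector, ps[0].nr_sects, ps[1].sector, ps[1].nr_sects);
    return 0;
}
```

With the sample extents this yields two physical segments, (1090,10) and (5000,10), which is why dispatch_rw_block_io reserves room for MAX_BLK_SEGS * 2 physical segments.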
diff --git a/xen/drivers/ide/ide-dma.c b/xen/drivers/ide/ide-dma.c
index 6ce5fd4b1f..c4661a6fbc 100644
--- a/xen/drivers/ide/ide-dma.c
+++ b/xen/drivers/ide/ide-dma.c
@@ -271,7 +271,7 @@ static int ide_build_sglist (ide_hwif_t *hwif, struct request *rq)
/*
* continue segment from before?
*/
- if (bh_phys(bh) == lastdataend) {
+ if (virt_to_phys(bh->b_data) == lastdataend) {
sg[nents - 1].length += bh->b_size;
lastdataend += bh->b_size;
continue;
@@ -285,25 +285,9 @@ static int ide_build_sglist (ide_hwif_t *hwif, struct request *rq)
sge = &sg[nents];
memset(sge, 0, sizeof(*sge));
-
- if (bh->b_page) {
- sge->page = bh->b_page;
- sge->offset = bh_offset(bh);
- } else {
-
-
-#if 0
- /* below is wrong for xen since b_data is actually
- a 'physical / virtual' thingy. Ask KAF. */
- if (((unsigned long) bh->b_data) < PAGE_SIZE)
- BUG();
-#endif
-
- sge->address = bh->b_data;
- }
-
+ sge->address = bh->b_data;
sge->length = bh->b_size;
- lastdataend = bh_phys(bh) + bh->b_size;
+ lastdataend = virt_to_phys(bh->b_data) + bh->b_size;
nents++;
} while ((bh = bh->b_reqnext) != NULL);
diff --git a/xen/include/hypervisor-ifs/block.h b/xen/include/hypervisor-ifs/block.h
index 476af1ab54..1b228c5c85 100644
--- a/xen/include/hypervisor-ifs/block.h
+++ b/xen/include/hypervisor-ifs/block.h
@@ -34,37 +34,42 @@
*/
/* the first four definitions match fs.h */
-#define XEN_BLOCK_READ 0
-#define XEN_BLOCK_WRITE 1
-#define XEN_BLOCK_READA 2 /* currently unused */
-#define XEN_BLOCK_SPECIAL 4 /* currently unused */
-#define XEN_BLOCK_PROBE_BLK 8 /* get xhd config from hypervisor */
-#define XEN_BLOCK_DEBUG 16 /* debug */
-#define XEN_BLOCK_SEG_CREATE 32 /* create segment (vhd) */
-#define XEN_BLOCK_SEG_DELETE 64 /* delete segment (vhd) */
-#define XEN_BLOCK_PROBE_SEG 128 /* get vhd config from hypervisor */
-
-#define BLK_RING_SIZE 128
-#define BLK_RING_MAX_ENTRIES (BLK_RING_SIZE - 2)
+#define XEN_BLOCK_READ 0
+#define XEN_BLOCK_WRITE 1
+#define XEN_BLOCK_READA 2
+#define XEN_BLOCK_SPECIAL 4
+#define XEN_BLOCK_PROBE_BLK 5 /* get xhd config from hypervisor */
+#define XEN_BLOCK_DEBUG 6 /* debug */
+#define XEN_BLOCK_SEG_CREATE 7 /* create segment (vhd) */
+#define XEN_BLOCK_SEG_DELETE 8 /* delete segment (vhd) */
+#define XEN_BLOCK_PROBE_SEG 9 /* get vhd config from hypervisor */
+
+/* NB. Ring size must be small enough for sizeof(blk_ring_t) <= PAGE_SIZE. */
+#define BLK_RING_SIZE 64
#define BLK_RING_INC(_i) (((_i)+1) & (BLK_RING_SIZE-1))
-#define BLK_RING_ADD(_i,_j) (((_i)+(_j)) & (BLK_RING_SIZE-1))
+
+/*
+ * Maximum scatter/gather segments per request.
+ * This is carefully chosen so that sizeof(blk_ring_t) <= PAGE_SIZE.
+ */
+#define MAX_BLK_SEGS 12
typedef struct blk_ring_req_entry
{
- void * id; /* for guest os use */
- int operation; /* from above */
- char * buffer;
- unsigned long block_number; /* block number */
- unsigned short block_size; /* block size */
- unsigned short device;
- unsigned long sector_number; /* real buffer location on disk */
+ unsigned long id; /* private guest os value */
+ unsigned long sector_number; /* start sector idx on disk */
+ unsigned short device; /* XENDEV_??? + idx */
+ unsigned char operation; /* XEN_BLOCK_??? */
+ unsigned char nr_segments; /* number of segments */
+ /* Least 9 bits is 'nr_sects'. High 23 bits are the address. */
+ unsigned long buffer_and_sects[MAX_BLK_SEGS];
} blk_ring_req_entry_t;
typedef struct blk_ring_resp_entry
{
- void * id; /* for guest os use */
- int operation; /* from above */
- unsigned long status;
+ unsigned long id; /* copied from request */
+ unsigned short operation; /* copied from request */
+ unsigned long status; /* currently boolean good/bad */
} blk_ring_resp_entry_t;
typedef struct blk_ring_st
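The request format above packs each scatter/gather element into a single word: the low 9 bits carry nr_sects and the remaining bits the sector-aligned buffer address (23 address bits on the 32-bit target of the day). A hedged sketch of the packing and unpacking; the helper names are illustrative only:

```c
#include <stdio.h>

/* Low 9 bits carry the sector count; the rest carry the (512-byte aligned)
 * buffer address.  Purely illustrative helpers, not part of the interface. */
#define SECT_BITS 9
#define SECT_MASK ((1UL << SECT_BITS) - 1)

static unsigned long pack(unsigned long buffer, unsigned long nr_sects)
{
    /* buffer must be sector aligned; nr_sects must fit in 9 bits. */
    return (buffer & ~SECT_MASK) | (nr_sects & SECT_MASK);
}

static void unpack(unsigned long word, unsigned long *buffer, unsigned long *nr_sects)
{
    *buffer   = word & ~SECT_MASK;
    *nr_sects = word &  SECT_MASK;
}

int main(void)
{
    unsigned long w = pack(0x12345600UL, 8), buf, n;
    unpack(w, &buf, &n);
    printf("buffer=%#lx nr_sects=%lu\n", buf, n);   /* buffer=0x12345600 nr_sects=8 */
    return 0;
}
```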
diff --git a/xen/include/xeno/blkdev.h b/xen/include/xeno/blkdev.h
index a2cd390517..7a6a6844dd 100644
--- a/xen/include/xeno/blkdev.h
+++ b/xen/include/xeno/blkdev.h
@@ -15,6 +15,15 @@
#define BLOCK_SIZE_BITS 10
#define BLOCK_SIZE (1<<BLOCK_SIZE_BITS)
+typedef struct {
+ struct task_struct *domain;
+ unsigned long id;
+ atomic_t pendcnt;
+ unsigned short operation;
+} pending_req_t;
+
+extern kdev_t xendev_to_physdev(unsigned short xendev);
+
extern void init_blkdev_info(struct task_struct *);
extern void destroy_blkdev_info(struct task_struct *);
@@ -61,28 +70,18 @@ enum bh_state_bits {
};
struct buffer_head {
- unsigned long b_blocknr; /* block number */
+ unsigned long b_rsector; /* Real buffer location on disk */
unsigned short b_size; /* block size */
- unsigned short b_list; /* List that this buffer appears */
kdev_t b_dev; /* device (B_FREE = free) */
-
- atomic_t b_count; /* users using this block */
- kdev_t b_rdev; /* Real device */
unsigned long b_state; /* buffer state bitmap (see above) */
-
struct buffer_head *b_reqnext; /* request queue */
-
- char * b_data; /* pointer to data block */
- struct pfn_info *b_page; /* the page this bh is mapped to */
+ char *b_data; /* pointer to data block */
void (*b_end_io)(struct buffer_head *bh, int uptodate);
-
- unsigned long b_rsector; /* Real buffer location on disk */
-
- /* Both used by b_end_io function in xen_block.c */
- void *b_xen_domain;
- void *b_xen_id;
+ pending_req_t *pending_req;
};
+#define b_rdev b_dev /* In Xen, there's no device layering (eg. s/w RAID). */
+
typedef void (bh_end_io_t)(struct buffer_head *bh, int uptodate);
void init_buffer(struct buffer_head *, bh_end_io_t *, void *);
@@ -101,8 +100,6 @@ void init_buffer(struct buffer_head *, bh_end_io_t *, void *);
extern void set_bh_page(struct buffer_head *bh, struct pfn_info *page, unsigned long offset);
-#define touch_buffer(bh) mark_page_accessed(bh->b_page)
-
#define atomic_set_buffer_clean(bh) test_and_clear_bit(BH_Dirty, &(bh)->b_state)
static inline void __mark_buffer_clean(struct buffer_head *bh)
@@ -261,8 +258,6 @@ struct request_queue
#endif
};
-#define bh_phys(bh) (page_to_phys((bh)->b_page) + bh_offset((bh)))
-
struct blk_dev_struct {
/*
* queue_proc has to be atomic
diff --git a/xen/include/xeno/sched.h b/xen/include/xeno/sched.h
index da86349149..6d1842a2ea 100644
--- a/xen/include/xeno/sched.h
+++ b/xen/include/xeno/sched.h
@@ -78,7 +78,8 @@ struct task_struct {
/* Block I/O */
blk_ring_t *blk_ring_base;
- unsigned int blk_req_cons; /* request consumer */
+ unsigned int blk_req_cons; /* request consumer */
+ unsigned int blk_resp_prod; /* (private version of) response producer */
struct list_head blkdev_list;
spinlock_t blk_ring_lock;
segment_t *segment_list[XEN_MAX_SEGMENTS]; /* vhd */
@@ -89,6 +90,8 @@ struct task_struct {
struct list_head run_list;
struct mm_struct mm;
+ /* We need this lock to check page types and frob reference counts. */
+ spinlock_t page_lock;
mm_segment_t addr_limit; /* thread address space:
0-0xBFFFFFFF for user-thead
diff --git a/xen/include/xeno/segment.h b/xen/include/xeno/segment.h
index abbeea278c..f6fcbb3958 100644
--- a/xen/include/xeno/segment.h
+++ b/xen/include/xeno/segment.h
@@ -3,18 +3,21 @@
#include <hypervisor-ifs/block.h>
+/* Describes a physical disk extent. */
+typedef struct {
+ unsigned short dev;
+ unsigned short nr_sects;
+ unsigned long sector_number;
+ unsigned long buffer;
+} phys_seg_t;
+
void xen_segment_initialize(void);
void xen_refresh_segment_list (struct task_struct *p);
int xen_segment_create(xv_disk_t *xvd);
int xen_segment_map_request(
- int *phys_device, /* out */
- unsigned long *block_number, /* out */
- unsigned long *sector_number, /* out */
- struct task_struct *domain,
- int operation,
- int segment_number,
- int xen_block_number,
- int xen_sector_number);
+ phys_seg_t *pseg, struct task_struct *p, int operation,
+ unsigned short segment_number,
+ unsigned long sect_nr, unsigned long buffer, unsigned short nr_sects);
#define XEN_MAX_SEGMENTS 100 /* total number of segments across all doms */
diff --git a/xen/net/dev.c b/xen/net/dev.c
index 03039e9c81..f8f8fe70cf 100644
--- a/xen/net/dev.c
+++ b/xen/net/dev.c
@@ -489,6 +489,7 @@ void deliver_packet(struct sk_buff *skb, net_vif_t *vif)
unsigned long *g_pte;
struct pfn_info *g_pfn, *h_pfn;
unsigned int i;
+ unsigned long flags;
memset(skb->mac.ethernet->h_dest, 0, ETH_ALEN);
if ( ntohs(skb->mac.ethernet->h_proto) == ETH_P_ARP )
@@ -508,6 +509,8 @@ void deliver_packet(struct sk_buff *skb, net_vif_t *vif)
if ( (skb->len + ETH_HLEN) < rx->size )
rx->size = skb->len + ETH_HLEN;
+ spin_lock_irqsave(&vif->domain->page_lock, flags);
+
g_pte = map_domain_mem(rx->addr);
g_pfn = frame_table + (*g_pte >> PAGE_SHIFT);
@@ -526,9 +529,11 @@ void deliver_packet(struct sk_buff *skb, net_vif_t *vif)
*g_pte = (*g_pte & ~PAGE_MASK)
| (((h_pfn - frame_table) << PAGE_SHIFT) & PAGE_MASK);
*g_pte |= _PAGE_PRESENT;
-
+
unmap_domain_mem(g_pte);
+ spin_unlock_irqrestore(&vif->domain->page_lock, flags);
+
/* Our skbuff now points at the guest's old frame. */
skb->pf = g_pfn;
@@ -661,10 +666,12 @@ static void tx_skb_release(struct sk_buff *skb)
net_vif_t *vif = sys_vif_list[skb->src_vif];
unsigned int idx;
tx_shadow_entry_t *tx;
- unsigned long cpu_mask;
+ unsigned long cpu_mask, flags;
+ spin_lock_irqsave(&vif->domain->page_lock, flags);
for ( i = 0; i < skb_shinfo(skb)->nr_frags; i++ )
put_page_tot(skb_shinfo(skb)->frags[i].page);
+ spin_unlock_irqrestore(&vif->domain->page_lock, flags);
if ( skb->skb_type == SKB_NODATA )
kmem_cache_free(net_header_cachep, skb->head);
@@ -713,8 +720,7 @@ static void tx_skb_release(struct sk_buff *skb)
/* Send a transmit event if requested. */
if ( send )
{
- cpu_mask = mark_guest_event(
- sys_vif_list[skb->src_vif]->domain, _EVENT_NET_TX);
+ cpu_mask = mark_guest_event(vif->domain, _EVENT_NET_TX);
guest_event_notify(cpu_mask);
}
}
@@ -1870,10 +1876,12 @@ long do_net_update(void)
pfn = tx.addr >> PAGE_SHIFT;
page = frame_table + pfn;
+ spin_lock_irq(&current->page_lock);
if ( (pfn >= max_page) ||
((page->flags & PG_domain_mask) != current->domain) )
{
DPRINTK("Bad page frame\n");
+ spin_unlock_irq(&current->page_lock);
continue;
}
@@ -1882,7 +1890,7 @@ long do_net_update(void)
protocol = __constant_htons(
init_tx_header(g_data, tx.size, the_dev));
if ( protocol == 0 )
- goto unmap_and_continue;
+ goto tx_unmap_and_continue;
target = __net_get_target_vif(g_data, tx.size, current_vif->id);
@@ -1890,7 +1898,7 @@ long do_net_update(void)
{
/* Local delivery */
if ( (skb = dev_alloc_skb(tx.size)) == NULL )
- goto unmap_and_continue;
+ goto tx_unmap_and_continue;
skb->destructor = tx_skb_release;
@@ -1915,15 +1923,16 @@ long do_net_update(void)
shadow_ring->tx_ring[i].header =
kmem_cache_alloc(net_header_cachep, GFP_KERNEL);
if ( shadow_ring->tx_ring[i].header == NULL )
- goto unmap_and_continue;
+ goto tx_unmap_and_continue;
memcpy(shadow_ring->tx_ring[i].header, g_data, PKT_PROT_LEN);
shadow_ring->tx_ring[i].payload = tx.addr + PKT_PROT_LEN;
shadow_ring->tx_ring[i].status = RING_STATUS_OK;
get_page_tot(page);
}
- unmap_and_continue:
+ tx_unmap_and_continue:
unmap_domain_mem(g_data);
+ spin_unlock_irq(&current->page_lock);
}
if ( shadow_ring->tx_prod != i )
@@ -1966,10 +1975,12 @@ long do_net_update(void)
shadow_ring->rx_ring[i].status = RING_STATUS_BAD_PAGE;
+ spin_lock_irq(&current->page_lock);
if ( (pfn >= max_page) ||
(page->flags != (PGT_l1_page_table | current->domain)) )
{
DPRINTK("Bad page frame containing ppte\n");
+ spin_unlock_irq(&current->page_lock);
continue;
}
@@ -1978,8 +1989,7 @@ long do_net_update(void)
if (!(*g_pte & _PAGE_PRESENT))
{
DPRINTK("Inavlid PTE passed down (not present)\n");
- unmap_domain_mem(g_pte);
- continue;
+ goto rx_unmap_and_continue;
}
page = (*g_pte >> PAGE_SHIFT) + frame_table;
@@ -1987,8 +1997,7 @@ long do_net_update(void)
if (page->tot_count != 1)
{
DPRINTK("An rx page must be mapped exactly once\n");
- unmap_domain_mem(g_pte);
- continue;
+ goto rx_unmap_and_continue;
}
/* The pte they passed was good, so take it away from them. */
@@ -1997,7 +2006,9 @@ long do_net_update(void)
page->flags = (page->flags & ~PG_type_mask) | PGT_net_rx_buf;
rx->flush_count = tlb_flush_count[smp_processor_id()];
+ rx_unmap_and_continue:
unmap_domain_mem(g_pte);
+ spin_unlock_irq(&current->page_lock);
}
if ( shadow_ring->rx_prod != i )
diff --git a/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/block/xl_block.c b/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/block/xl_block.c
index c1177fa7c8..40f93cc251 100644
--- a/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/block/xl_block.c
+++ b/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/block/xl_block.c
@@ -19,10 +19,11 @@ typedef unsigned char byte; /* from linux/ide.h */
static blk_ring_t *blk_ring;
static unsigned int resp_cons; /* Response consumer for comms ring. */
+static unsigned int req_prod; /* Private request producer. */
static xen_disk_info_t xlblk_disk_info;
static int xlblk_control_msg_pending;
-#define RING_FULL (BLK_RING_INC(blk_ring->req_prod) == resp_cons)
+#define RING_FULL (BLK_RING_INC(req_prod) == resp_cons)
/*
* Request queues with outstanding work, but ring is currently full.
@@ -33,6 +34,18 @@ static int xlblk_control_msg_pending;
static request_queue_t *pending_queues[MAX_PENDING];
static int nr_pending;
+static kdev_t sg_dev;
+static int sg_operation = -1;
+static unsigned long sg_next_sect;
+#define DISABLE_SCATTERGATHER() (sg_operation = -1)
+
+static inline void signal_requests_to_xen(void)
+{
+ DISABLE_SCATTERGATHER();
+ blk_ring->req_prod = req_prod;
+ HYPERVISOR_block_io_op();
+}
+
/* Convert from a XenoLinux major device to the Xen-level 'physical' device */
static inline unsigned short xldev_to_physdev(kdev_t xldev)
{
@@ -253,31 +266,22 @@ int xenolinux_block_revalidate(kdev_t dev)
* operation: XEN_BLOCK_{READ,WRITE,PROBE*,SEG*}
* buffer: buffer to read/write into. this should be a
* virtual address in the guest os.
- * block_number: block to read
- * block_size: size of each block
- * device: xhd*, ksd*, xvd*, ...
*/
-static int hypervisor_request(void * id,
+static int hypervisor_request(unsigned long id,
int operation,
char * buffer,
- unsigned long block_number,
- unsigned short block_size,
+ unsigned long sector_number,
+ unsigned short nr_sectors,
kdev_t device)
{
- int position;
- void *buffer_ma;
+ unsigned long buffer_ma = phys_to_machine(virt_to_phys(buffer));
kdev_t phys_device = (kdev_t) 0;
- unsigned long sector_number = 0;
struct gendisk *gd;
-
- /*
- * Bail if there's no room in the request communication ring. This may be
- * because we have a whole bunch of outstanding responses to process. No
- * matter, as the response handler will kick the request queue.
- */
- if ( RING_FULL ) return 1;
+ blk_ring_req_entry_t *req;
+ struct buffer_head *bh;
- buffer_ma = (void *)phys_to_machine(virt_to_phys(buffer));
+ if ( nr_sectors >= (1<<9) ) BUG();
+ if ( (buffer_ma & ((1<<9)-1)) != 0 ) BUG();
switch ( operation )
{
@@ -285,17 +289,42 @@ static int hypervisor_request(void * id,
case XEN_BLOCK_SEG_DELETE:
case XEN_BLOCK_PROBE_BLK:
case XEN_BLOCK_PROBE_SEG:
+ if ( RING_FULL ) return 1;
phys_device = (kdev_t) 0;
sector_number = 0;
+ DISABLE_SCATTERGATHER();
break;
case XEN_BLOCK_READ:
case XEN_BLOCK_WRITE:
phys_device = xldev_to_physdev(device);
- /* Compute real buffer location on disk */
- sector_number = block_number;
gd = xldev_to_gendisk(device);
sector_number += gd->part[MINOR(device)].start_sect;
+ if ( (sg_operation == operation) &&
+ (sg_dev == phys_device) &&
+ (sg_next_sect == sector_number) )
+ {
+ req = &blk_ring->ring[(req_prod-1)&(BLK_RING_SIZE-1)].req;
+ bh = (struct buffer_head *)id;
+ bh->b_reqnext = (struct buffer_head *)req->id;
+ req->id = id;
+ req->buffer_and_sects[req->nr_segments] = buffer_ma | nr_sectors;
+ if ( ++req->nr_segments < MAX_BLK_SEGS )
+ sg_next_sect += nr_sectors;
+ else
+ DISABLE_SCATTERGATHER();
+ return 0;
+ }
+ else if ( RING_FULL )
+ {
+ return 1;
+ }
+ else
+ {
+ sg_operation = operation;
+ sg_dev = phys_device;
+ sg_next_sect = sector_number + nr_sectors;
+ }
break;
default:
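The frontend hunk above coalesces consecutive buffer heads into a single ring entry whenever they are for the same operation and device and are contiguous on disk, chaining them through b_reqnext so the response handler can complete them all at once. A simplified, standalone sketch of that merge decision (the field names echo the driver, but the structure is an assumption for illustration):

```c
#include <stdio.h>

#define MAX_BLK_SEGS 12

/* Minimal stand-in for the driver's per-entry state (assumed, not the real type). */
struct req { int nr_segments; unsigned long next_sect; int op, dev; };

/* Try to merge (op, dev, sector, nr_sects) into the current ring entry.
 * Returns 1 if merged, 0 if a fresh request entry is needed. */
static int try_merge(struct req *r, int op, int dev,
                     unsigned long sector, unsigned long nr_sects)
{
    if ( (r->nr_segments == 0) ||              /* nothing to merge into   */
         (r->nr_segments == MAX_BLK_SEGS) ||   /* entry already full      */
         (r->op != op) || (r->dev != dev) ||   /* different op or device  */
         (r->next_sect != sector) )            /* not contiguous on disk  */
        return 0;
    r->nr_segments++;
    r->next_sect = sector + nr_sects;
    return 1;
}

int main(void)
{
    struct req r = { 1, 108, 0, 3 };           /* one segment ending at sector 108 */
    printf("%d\n", try_merge(&r, 0, 3, 108, 8));   /* contiguous: merged (1) */
    printf("%d\n", try_merge(&r, 0, 3, 200, 8));   /* gap: new entry needed (0) */
    return 0;
}
```

This is what lets adjacent requests pulled off the ring end up as one large scatter/gather operation at the disc, which is the source of the write-speed improvement noted in the commit message.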
@@ -303,16 +332,14 @@ static int hypervisor_request(void * id,
}
/* Fill out a communications ring structure. */
- position = blk_ring->req_prod;
- blk_ring->ring[position].req.id = id;
- blk_ring->ring[position].req.operation = operation;
- blk_ring->ring[position].req.buffer = buffer_ma;
- blk_ring->ring[position].req.block_number = block_number;
- blk_ring->ring[position].req.block_size = block_size;
- blk_ring->ring[position].req.device = phys_device;
- blk_ring->ring[position].req.sector_number = sector_number;
-
- blk_ring->req_prod = BLK_RING_INC(position);
+ req = &blk_ring->ring[req_prod].req;
+ req->id = id;
+ req->operation = operation;
+ req->sector_number = sector_number;
+ req->device = phys_device;
+ req->nr_segments = 1;
+ req->buffer_and_sects[0] = buffer_ma | nr_sectors;
+ req_prod = BLK_RING_INC(req_prod);
return 0;
}
@@ -325,7 +352,7 @@ static int hypervisor_request(void * id,
void do_xlblk_request(request_queue_t *rq)
{
struct request *req;
- struct buffer_head *bh;
+ struct buffer_head *bh, *next_bh;
int rw, nsect, full, queued = 0;
DPRINTK("xlblk.c::do_xlblk_request for '%s'\n", DEVICE_NAME);
@@ -349,12 +376,17 @@ void do_xlblk_request(request_queue_t *rq)
bh = req->bh;
while ( bh != NULL )
{
+ next_bh = bh->b_reqnext;
+ bh->b_reqnext = NULL;
+
full = hypervisor_request(
- bh, (rw == READ) ? XEN_BLOCK_READ : XEN_BLOCK_WRITE,
- bh->b_data, bh->b_rsector, bh->b_size, bh->b_dev);
+ (unsigned long)bh,
+ (rw == READ) ? XEN_BLOCK_READ : XEN_BLOCK_WRITE,
+ bh->b_data, bh->b_rsector, bh->b_size>>9, bh->b_rdev);
if ( full )
{
+ bh->b_reqnext = next_bh;
pending_queues[nr_pending++] = rq;
if ( nr_pending >= MAX_PENDING ) BUG();
goto out;
@@ -364,9 +396,7 @@ void do_xlblk_request(request_queue_t *rq)
/* Dequeue the buffer head from the request. */
nsect = bh->b_size >> 9;
- req->bh = bh->b_reqnext;
- bh->b_reqnext = NULL;
- bh = req->bh;
+ bh = req->bh = next_bh;
if ( bh != NULL )
{
@@ -389,7 +419,7 @@ void do_xlblk_request(request_queue_t *rq)
}
out:
- if ( queued != 0 ) HYPERVISOR_block_io_op();
+ if ( queued != 0 ) signal_requests_to_xen();
}
@@ -397,7 +427,7 @@ static void xlblk_response_int(int irq, void *dev_id, struct pt_regs *ptregs)
{
int i;
unsigned long flags;
- struct buffer_head *bh;
+ struct buffer_head *bh, *next_bh;
spin_lock_irqsave(&io_request_lock, flags);
@@ -410,7 +440,14 @@ static void xlblk_response_int(int irq, void *dev_id, struct pt_regs *ptregs)
{
case XEN_BLOCK_READ:
case XEN_BLOCK_WRITE:
- if ( (bh = bret->id) != NULL ) bh->b_end_io(bh, 1);
+ for ( bh = (struct buffer_head *)bret->id;
+ bh != NULL;
+ bh = next_bh )
+ {
+ next_bh = bh->b_reqnext;
+ bh->b_reqnext = NULL;
+ bh->b_end_io(bh, 1);
+ }
break;
case XEN_BLOCK_SEG_CREATE:
@@ -429,7 +466,7 @@ static void xlblk_response_int(int irq, void *dev_id, struct pt_regs *ptregs)
/* We kick pending request queues if the ring is reasonably empty. */
if ( (nr_pending != 0) &&
- (((blk_ring->req_prod - resp_cons) & (BLK_RING_SIZE - 1)) <
+ (((req_prod - resp_cons) & (BLK_RING_SIZE - 1)) <
(BLK_RING_SIZE >> 1)) )
{
/* Attempt to drain the queue, but bail if the ring becomes full. */
@@ -445,13 +482,27 @@ static void xlblk_response_int(int irq, void *dev_id, struct pt_regs *ptregs)
/* Send a synchronous message to Xen. */
-int xenolinux_control_msg(int operation, char *buffer)
+int xenolinux_control_msg(int operation, char *buffer, int size)
{
- xlblk_control_msg_pending = 1; barrier();
- if ( hypervisor_request(NULL, operation, buffer, 0, 0, 0) )
+ unsigned long flags;
+ char *aligned_buf;
+
+ /* We copy from an aligned buffer, as interface needs sector alignment. */
+ aligned_buf = get_free_page(GFP_KERNEL);
+ if ( aligned_buf == NULL ) BUG();
+
+ xlblk_control_msg_pending = 1;
+ spin_lock_irqsave(&io_request_lock, flags);
+ /* Note that size gets rounded up to a sector-sized boundary. */
+ if ( hypervisor_request(0, operation, aligned_buf, 0, (size+511)/512, 0) )
return -EAGAIN;
- HYPERVISOR_block_io_op();
- while ( xlblk_control_msg_pending ) barrier();
+ signal_requests_to_xen();
+ spin_unlock_irqrestore(&io_request_lock, flags);
+ while ( xlblk_control_msg_pending ) barrier();
+
+ memcpy(buffer, aligned_buf, size);
+ free_page(aligned_buf);
+
return 0;
}
@@ -465,7 +516,7 @@ int __init xlblk_init(void)
/* This mapping was created early at boot time. */
blk_ring = (blk_ring_t *)fix_to_virt(FIX_BLKRING_BASE);
- blk_ring->req_prod = blk_ring->resp_prod = resp_cons = 0;
+ blk_ring->req_prod = blk_ring->resp_prod = resp_cons = req_prod = 0;
error = request_irq(XLBLK_RESPONSE_IRQ, xlblk_response_int, 0,
"xlblk-response", NULL);
@@ -478,7 +529,8 @@ int __init xlblk_init(void)
/* Probe for disk information. */
memset(&xlblk_disk_info, 0, sizeof(xlblk_disk_info));
error = xenolinux_control_msg(XEN_BLOCK_PROBE_BLK,
- (char *)&xlblk_disk_info);
+ (char *)&xlblk_disk_info,
+ sizeof(xen_disk_info_t));
if ( error )
{
printk(KERN_ALERT "Could not probe disks (%d)\n", error);
diff --git a/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/block/xl_block.h b/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/block/xl_block.h
index 17ca09d9f2..42da335c4a 100644
--- a/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/block/xl_block.h
+++ b/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/block/xl_block.h
@@ -46,7 +46,7 @@ typedef struct xl_disk {
} xl_disk_t;
/* Generic layer. */
-extern int xenolinux_control_msg(int operration, char *buffer);
+extern int xenolinux_control_msg(int operration, char *buffer, int size);
extern int xenolinux_block_open(struct inode *inode, struct file *filep);
extern int xenolinux_block_release(struct inode *inode, struct file *filep);
extern int xenolinux_block_ioctl(struct inode *inode, struct file *filep,
diff --git a/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/block/xl_segment.c b/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/block/xl_segment.c
index da2086e22c..97a53fe6fd 100644
--- a/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/block/xl_segment.c
+++ b/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/block/xl_segment.c
@@ -51,7 +51,7 @@ int __init xlseg_init(void)
/* Probe for disk information. */
memset(xdi, 0, sizeof(*xdi));
- xenolinux_control_msg(XEN_BLOCK_PROBE_SEG, (char *)xdi);
+ xenolinux_control_msg(XEN_BLOCK_PROBE_SEG, (char *)xdi, sizeof(*xdi));
DPRINTK("vhd block device probe:\n");
for ( i = 0; i < xdi->count; i++ )
diff --git a/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/block/xl_segment_proc.c b/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/block/xl_segment_proc.c
index e7c121b683..3149be747b 100644
--- a/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/block/xl_segment_proc.c
+++ b/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/block/xl_segment_proc.c
@@ -210,7 +210,7 @@ static int proc_write_vhd(struct file *file, const char *buffer,
xvd.extents[loop].size = to_number(string);
}
- xenolinux_control_msg(XEN_BLOCK_SEG_CREATE, (char *)&xvd);
+ xenolinux_control_msg(XEN_BLOCK_SEG_CREATE, (char *)&xvd, sizeof(xvd));
return count;
}