From 6607af3a4627f1e17459cd798379136763c0c86a Mon Sep 17 00:00:00 2001 From: "kaf24@labyrinth.cl.cam.ac.uk" Date: Sun, 9 Mar 2003 20:51:18 +0000 Subject: bitkeeper revision 1.115 (3e6ba94627SF_Dv66Al7guNkgaK_xg) Many files: Add scatter/gather to the Xen blkdev interface. Our write speeds are now comparable with Linux. Also fixed a few bugs. --- xen/common/dom_mem_ops.c | 8 +- xen/common/domain.c | 1 + xen/common/memory.c | 2 + xen/drivers/block/ll_rw_blk.c | 2 +- xen/drivers/block/xen_block.c | 529 ++++++++++++++------- xen/drivers/block/xen_segment.c | 93 ++-- xen/drivers/ide/ide-dma.c | 22 +- xen/include/hypervisor-ifs/block.h | 51 +- xen/include/xeno/blkdev.h | 33 +- xen/include/xeno/sched.h | 5 +- xen/include/xeno/segment.h | 19 +- xen/net/dev.c | 35 +- .../arch/xeno/drivers/block/xl_block.c | 146 ++++-- .../arch/xeno/drivers/block/xl_block.h | 2 +- .../arch/xeno/drivers/block/xl_segment.c | 2 +- .../arch/xeno/drivers/block/xl_segment_proc.c | 2 +- 16 files changed, 590 insertions(+), 362 deletions(-) diff --git a/xen/common/dom_mem_ops.c b/xen/common/dom_mem_ops.c index 08a47652d3..e919311c5c 100644 --- a/xen/common/dom_mem_ops.c +++ b/xen/common/dom_mem_ops.c @@ -37,6 +37,7 @@ static long alloc_dom_mem(struct task_struct *p, balloon_def_op_t bop) return -ENOMEM; spin_lock_irqsave(&free_list_lock, flags); + spin_lock(&p->page_lock); temp = free_list.next; for ( i = 0; i < bop.size; i++ ) @@ -63,6 +64,7 @@ static long alloc_dom_mem(struct task_struct *p, balloon_def_op_t bop) unmap_domain_mem(va); } + spin_unlock(&p->page_lock); spin_unlock_irqrestore(&free_list_lock, flags); return bop.size; @@ -78,7 +80,8 @@ static long free_dom_mem(struct task_struct *p, balloon_inf_op_t bop) long rc = 0; spin_lock_irqsave(&free_list_lock, flags); - + spin_lock(&p->page_lock); + temp = free_list.next; for ( i = 0; i < bop.size; i++ ) { @@ -94,7 +97,7 @@ static long free_dom_mem(struct task_struct *p, balloon_inf_op_t bop) pf = &frame_table[mpfn]; if ( (pf->type_count != 0) || - (pf->type_count != 0) || + (pf->tot_count != 0) || (pf->flags != p->domain) ) { DPRINTK("Bad page free for domain %d (%ld, %ld, %08lx)\n", @@ -113,6 +116,7 @@ static long free_dom_mem(struct task_struct *p, balloon_inf_op_t bop) } out: + spin_unlock(&p->page_lock); spin_unlock_irqrestore(&free_list_lock, flags); return rc ? rc : bop.size; diff --git a/xen/common/domain.c b/xen/common/domain.c index 7ec09a8bb8..32bf8b7172 100644 --- a/xen/common/domain.c +++ b/xen/common/domain.c @@ -46,6 +46,7 @@ struct task_struct *do_newdomain(unsigned int dom_id, unsigned int cpu) p->processor = cpu; spin_lock_init(&p->blk_ring_lock); + spin_lock_init(&p->page_lock); p->shared_info = (void *)get_free_page(GFP_KERNEL); memset(p->shared_info, 0, PAGE_SIZE); diff --git a/xen/common/memory.c b/xen/common/memory.c index 4a0304aaf8..4b0848ea9d 100644 --- a/xen/common/memory.c +++ b/xen/common/memory.c @@ -726,6 +726,7 @@ int do_process_page_updates(page_update_request_t *ureqs, int count) err = 1; /* Least significant bits of 'ptr' demux the operation type. 
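As a rough illustration of that trick (ordinary user-space C with made-up OP_* names, not Xen's): a pointer to a 4-byte-aligned page-table entry always has its two low bits clear, so those bits can carry a small operation code and be masked off before the pointer is used.

#include <stdio.h>
#include <stdint.h>

#define OP_MASK     0x3u            // spare low bits of a 4-byte-aligned pointer
#define OP_NORMAL   0x0u
#define OP_EXTENDED 0x1u

static void demux(uintptr_t ptr_and_op)
{
    uintptr_t ptr = ptr_and_op & ~(uintptr_t)OP_MASK;      // real target address
    unsigned int op = (unsigned int)(ptr_and_op & OP_MASK); // operation selector

    switch (op) {
    case OP_NORMAL:   printf("normal update of entry at %#lx\n", (unsigned long)ptr); break;
    case OP_EXTENDED: printf("extended command on %#lx\n", (unsigned long)ptr); break;
    default:          printf("unknown op %u\n", op); break;
    }
}

int main(void)
{
    demux(0x1000 | OP_EXTENDED);
    return 0;
}
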
*/ + spin_lock_irq(¤t->page_lock); switch ( req.ptr & (sizeof(l1_pgentry_t)-1) ) { /* @@ -799,6 +800,7 @@ int do_process_page_updates(page_update_request_t *ureqs, int count) MEM_LOG("Invalid page update command %08lx", req.ptr); break; } + spin_unlock_irq(¤t->page_lock); if ( err ) { diff --git a/xen/drivers/block/ll_rw_blk.c b/xen/drivers/block/ll_rw_blk.c index 615b332c4b..870b5cdb85 100644 --- a/xen/drivers/block/ll_rw_blk.c +++ b/xen/drivers/block/ll_rw_blk.c @@ -1224,7 +1224,7 @@ void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]) continue; /* We have the buffer lock */ - atomic_inc(&bh->b_count); + /*atomic_inc(&bh->b_count);*/ switch(rw) { case WRITE: diff --git a/xen/drivers/block/xen_block.c b/xen/drivers/block/xen_block.c index 81b4f4e75a..b72bb0640c 100644 --- a/xen/drivers/block/xen_block.c +++ b/xen/drivers/block/xen_block.c @@ -18,7 +18,7 @@ #include #include -#if 0 +#if 1 #define DPRINTK(_f, _a...) printk( _f , ## _a ) #else #define DPRINTK(_f, _a...) ((void)0) @@ -28,12 +28,30 @@ * These are rather arbitrary. They are fairly large because adjacent * requests pulled from a communication ring are quite likely to end * up being part of the same scatter/gather request at the disc. - * It might be a good idea to add scatter/gather support explicitly to - * the scatter/gather ring (eg. each request has an array of N pointers); - * then these values would better reflect real costs at the disc. + * + * ** TRY INCREASING 'MAX_PENDING_REQS' IF WRITE SPEEDS SEEM TOO LOW ** + * This will increase the chances of being able to write whole tracks. + * '64' should be enough to keep us competitive with Linux. */ -#define MAX_PENDING_REQS 32 -#define BATCH_PER_DOMAIN 8 +#define MAX_PENDING_REQS 64 +#define BATCH_PER_DOMAIN 16 + +/* + * Each outstanding request which we've passed to the lower device layers + * has a 'pending_req' allocated to it. Each buffer_head that completes + * decrements the pendcnt towards zero. When it hits zero, the specified + * domain has a response queued for it, with the saved 'id' passed back. + * + * We can't allocate pending_req's in order, since they may complete out + * of order. We therefore maintain an allocation ring. This ring also + * indicates when enough work has been passed down -- at that point the + * allocation ring will be empty. 
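A stripped-down sketch of that allocation ring (illustrative names, user-space C; the real code additionally bounds outstanding requests with an atomic nr_pending counter and serialises the producer index with a lock): a fixed pool of slots plus a ring of free slot indices, so slots can be handed back in whatever order the I/O completes.

#include <stdio.h>

#define NSLOTS 2                           // power of two, like MAX_PENDING_REQS
#define IDX_INC(i) ((i) = ((i) + 1) & (NSLOTS - 1))

static unsigned char free_ring[NSLOTS];
static unsigned int prod, cons;            // freed slots go in at prod, come out at cons

static void pool_init(void)
{
    for (int i = 0; i < NSLOTS; i++)
        free_ring[i] = i;                  // initially every slot is free
    prod = cons = 0;
}

static int slot_alloc(void)
{
    int slot = free_ring[cons];
    IDX_INC(cons);
    return slot;
}

static void slot_free(int slot)            // may be called out of allocation order
{
    free_ring[prod] = slot;
    IDX_INC(prod);
}

int main(void)
{
    pool_init();
    int a = slot_alloc(), b = slot_alloc();
    slot_free(b);                          // 'b' completes first -- order doesn't matter
    slot_free(a);
    int c = slot_alloc(), d = slot_alloc();
    printf("slots reused as %d then %d\n", c, d);
    return 0;
}

When the free ring runs empty every slot is in flight, which is the condition used to decide that enough work has been pushed down to the device layer.
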
+ */ +static pending_req_t pending_reqs[MAX_PENDING_REQS]; +static unsigned char pending_ring[MAX_PENDING_REQS]; +static unsigned int pending_prod, pending_cons; +static spinlock_t pend_prod_lock = SPIN_LOCK_UNLOCKED; +#define PENDREQ_IDX_INC(_i) ((_i) = ((_i)+1) & (MAX_PENDING_REQS-1)) static kmem_cache_t *buffer_head_cachep; static atomic_t nr_pending; @@ -65,6 +83,18 @@ static kdev_t scsi_devs[NR_SCSI_DEVS] = { MKDEV(SCSI_DISK0_MAJOR, 224), MKDEV(SCSI_DISK0_MAJOR, 240), /* sdo, sdp */ }; +static int __buffer_is_valid(struct task_struct *p, + unsigned long buffer, + unsigned short size, + int writeable_buffer); +static void __lock_buffer(unsigned long buffer, + unsigned short size, + int writeable_buffer); +static void unlock_buffer(struct task_struct *p, + unsigned long buffer, + unsigned short size, + int writeable_buffer); + static void io_schedule(unsigned long unused); static int do_block_io_op_domain(struct task_struct *p, int max_to_do); static void dispatch_rw_block_io(struct task_struct *p, int index); @@ -73,8 +103,8 @@ static void dispatch_probe_seg(struct task_struct *p, int index); static void dispatch_debug_block_io(struct task_struct *p, int index); static void dispatch_create_segment(struct task_struct *p, int index); static void dispatch_delete_segment(struct task_struct *p, int index); -static void make_response(struct task_struct *p, void *id, int op, - unsigned long st); +static void make_response(struct task_struct *p, unsigned long id, + unsigned short op, unsigned long st); /****************************************************************** @@ -165,28 +195,27 @@ static void maybe_trigger_io_schedule(void) static void end_block_io_op(struct buffer_head *bh, int uptodate) { - struct pfn_info *page; - unsigned long pfn; + unsigned long flags; + pending_req_t *pending_req = bh->pending_req; - for ( pfn = virt_to_phys(bh->b_data) >> PAGE_SHIFT; - pfn < ((virt_to_phys(bh->b_data) + bh->b_size + PAGE_SIZE - 1) >> - PAGE_SHIFT); - pfn++ ) + unlock_buffer(pending_req->domain, + virt_to_phys(bh->b_data), + bh->b_size, + (pending_req->operation==READ)); + + if ( atomic_dec_and_test(&pending_req->pendcnt) ) { - page = frame_table + pfn; - if ( ((bh->b_state & (1 << BH_Read)) != 0) && - (put_page_type(page) == 0) ) - page->flags &= ~PG_type_mask; - put_page_tot(page); + make_response(pending_req->domain, pending_req->id, + pending_req->operation, uptodate ? 0 : 1); + spin_lock_irqsave(&pend_prod_lock, flags); + pending_ring[pending_prod] = pending_req - pending_reqs; + PENDREQ_IDX_INC(pending_prod); + spin_unlock_irqrestore(&pend_prod_lock, flags); + atomic_dec(&nr_pending); + maybe_trigger_io_schedule(); } - atomic_dec(&nr_pending); - make_response(bh->b_xen_domain, bh->b_xen_id, - XEN_BLOCK_READ, uptodate ? 0 : 1); - kmem_cache_free(buffer_head_cachep, bh); - - maybe_trigger_io_schedule(); } @@ -208,16 +237,105 @@ long do_block_io_op(void) * DOWNWARD CALLS -- These interface with the block-device layer proper. */ -static int do_block_io_op_domain(struct task_struct* p, int max_to_do) +static int __buffer_is_valid(struct task_struct *p, + unsigned long buffer, + unsigned short size, + int writeable_buffer) +{ + unsigned long pfn; + struct pfn_info *page; + int rc = 0; + + /* A request may span multiple page frames. Each must be checked. */ + for ( pfn = buffer >> PAGE_SHIFT; + pfn < ((buffer + size + PAGE_SIZE - 1) >> PAGE_SHIFT); + pfn++ ) + { + /* Each frame must be within bounds of machine memory. 
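The bounds test that follows walks every frame a buffer touches. A tiny user-space sketch of the same arithmetic (PAGE_SHIFT and the max_pfn limit are stand-in values, not Xen's):

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

static int range_in_bounds(unsigned long buffer, unsigned long size,
                           unsigned long max_pfn)
{
    unsigned long first = buffer >> PAGE_SHIFT;
    unsigned long last  = (buffer + size + PAGE_SIZE - 1) >> PAGE_SHIFT;  // exclusive

    for (unsigned long pfn = first; pfn < last; pfn++)
        if (pfn >= max_pfn)
            return 0;                      // frame lies outside machine memory
    return 1;
}

int main(void)
{
    // a 4 KB buffer starting 512 bytes into a frame spans two frames
    printf("%d\n", range_in_bounds(0x1200, 4096, 0x100000));
    return 0;
}
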
*/ + if ( pfn >= max_page ) + { + DPRINTK("pfn out of range: %08lx\n", pfn); + goto out; + } + + page = frame_table + pfn; + + /* Each frame must belong to the requesting domain. */ + if ( (page->flags & PG_domain_mask) != p->domain ) + { + DPRINTK("bad domain: expected %d, got %ld\n", + p->domain, page->flags & PG_domain_mask); + goto out; + } + + /* If reading into the frame, the frame must be writeable. */ + if ( writeable_buffer && + ((page->flags & PG_type_mask) != PGT_writeable_page) ) + { + DPRINTK("non-writeable page passed for block read\n"); + goto out; + } + } + + rc = 1; + out: + return rc; +} + +static void __lock_buffer(unsigned long buffer, + unsigned short size, + int writeable_buffer) +{ + unsigned long pfn; + struct pfn_info *page; + + for ( pfn = buffer >> PAGE_SHIFT; + pfn < ((buffer + size + PAGE_SIZE - 1) >> PAGE_SHIFT); + pfn++ ) + { + page = frame_table + pfn; + if ( writeable_buffer ) get_page_type(page); + get_page_tot(page); + } +} + +static void unlock_buffer(struct task_struct *p, + unsigned long buffer, + unsigned short size, + int writeable_buffer) +{ + unsigned long pfn, flags; + struct pfn_info *page; + + spin_lock_irqsave(&p->page_lock, flags); + for ( pfn = buffer >> PAGE_SHIFT; + pfn < ((buffer + size + PAGE_SIZE - 1) >> PAGE_SHIFT); + pfn++ ) + { + page = frame_table + pfn; + if ( writeable_buffer && (put_page_type(page) == 0) ) + page->flags &= ~PG_type_mask; + put_page_tot(page); + } + spin_unlock_irqrestore(&p->page_lock, flags); +} + +static int do_block_io_op_domain(struct task_struct *p, int max_to_do) { blk_ring_t *blk_ring = p->blk_ring_base; int i, more_to_do = 0; + /* + * Take items off the comms ring, taking care not to catch up + * with the response-producer index. + */ for ( i = p->blk_req_cons; - i != blk_ring->req_prod; + (i != blk_ring->req_prod) && + (((p->blk_resp_prod-i) & (BLK_RING_SIZE-1)) != 1); i = BLK_RING_INC(i) ) { - if ( max_to_do-- == 0 ) + if ( (max_to_do-- == 0) || + (atomic_read(&nr_pending) == MAX_PENDING_REQS) ) { more_to_do = 1; break; @@ -251,8 +369,11 @@ static int do_block_io_op_domain(struct task_struct* p, int max_to_do) break; default: - panic("error: unknown block io operation [%d]\n", - blk_ring->ring[i].req.operation); + DPRINTK("error: unknown block io operation [%d]\n", + blk_ring->ring[i].req.operation); + make_response(p, blk_ring->ring[i].req.id, + blk_ring->ring[i].req.operation, 1); + break; } } @@ -268,23 +389,38 @@ static void dispatch_debug_block_io(struct task_struct *p, int index) static void dispatch_create_segment(struct task_struct *p, int index) { blk_ring_t *blk_ring = p->blk_ring_base; + unsigned long flags, buffer; xv_disk_t *xvd; int result; - if (p->domain != 0) + if ( p->domain != 0 ) { DPRINTK("dispatch_create_segment called by dom%d\n", p->domain); - make_response(p, blk_ring->ring[index].req.id, - XEN_BLOCK_SEG_CREATE, 1); - return; + result = 1; + goto out; + } + + buffer = blk_ring->ring[index].req.buffer_and_sects[0] & ~0x1FF; + + spin_lock_irqsave(&p->page_lock, flags); + if ( !__buffer_is_valid(p, buffer, sizeof(xv_disk_t), 1) ) + { + DPRINTK("Bad buffer in dispatch_create_segment\n"); + spin_unlock_irqrestore(&p->page_lock, flags); + result = 1; + goto out; } + __lock_buffer(buffer, sizeof(xv_disk_t), 1); + spin_unlock_irqrestore(&p->page_lock, flags); - xvd = phys_to_virt((unsigned long)blk_ring->ring[index].req.buffer); + xvd = phys_to_virt(buffer); result = xen_segment_create(xvd); + unlock_buffer(p, buffer, sizeof(xv_disk_t), 1); + + out: make_response(p, 
blk_ring->ring[index].req.id, XEN_BLOCK_SEG_CREATE, result); - return; } static void dispatch_delete_segment(struct task_struct *p, int index) @@ -299,13 +435,30 @@ static void dispatch_probe_blk(struct task_struct *p, int index) blk_ring_t *blk_ring = p->blk_ring_base; xen_disk_info_t *xdi; + unsigned long flags, buffer; + int rc = 0; + + buffer = blk_ring->ring[index].req.buffer_and_sects[0] & ~0x1FF; - xdi = phys_to_virt((unsigned long)blk_ring->ring[index].req.buffer); + spin_lock_irqsave(&p->page_lock, flags); + if ( !__buffer_is_valid(p, buffer, sizeof(xen_disk_info_t), 1) ) + { + DPRINTK("Bad buffer in dispatch_probe_blk\n"); + spin_unlock_irqrestore(&p->page_lock, flags); + rc = 1; + goto out; + } + __lock_buffer(buffer, sizeof(xen_disk_info_t), 1); + spin_unlock_irqrestore(&p->page_lock, flags); + xdi = phys_to_virt(buffer); ide_probe_devices(xdi); scsi_probe_devices(xdi); - make_response(p, blk_ring->ring[index].req.id, XEN_BLOCK_PROBE_BLK, 0); + unlock_buffer(p, buffer, sizeof(xen_disk_info_t), 1); + + out: + make_response(p, blk_ring->ring[index].req.id, XEN_BLOCK_PROBE_BLK, rc); } static void dispatch_probe_seg(struct task_struct *p, int index) @@ -313,175 +466,147 @@ static void dispatch_probe_seg(struct task_struct *p, int index) extern void xen_segment_probe(xen_disk_info_t *xdi); blk_ring_t *blk_ring = p->blk_ring_base; xen_disk_info_t *xdi; + unsigned long flags, buffer; + int rc = 0; + + buffer = blk_ring->ring[index].req.buffer_and_sects[0] & ~0x1FF; + + spin_lock_irqsave(&p->page_lock, flags); + if ( !__buffer_is_valid(p, buffer, sizeof(xen_disk_info_t), 1) ) + { + DPRINTK("Bad buffer in dispatch_probe_seg\n"); + spin_unlock_irqrestore(&p->page_lock, flags); + rc = 1; + goto out; + } + __lock_buffer(buffer, sizeof(xen_disk_info_t), 1); + spin_unlock_irqrestore(&p->page_lock, flags); - xdi = phys_to_virt((unsigned long)blk_ring->ring[index].req.buffer); + xdi = phys_to_virt(buffer); xen_segment_probe(xdi); - make_response(p, blk_ring->ring[index].req.id, XEN_BLOCK_PROBE_SEG, 0); + unlock_buffer(p, buffer, sizeof(xen_disk_info_t), 1); + + out: + make_response(p, blk_ring->ring[index].req.id, XEN_BLOCK_PROBE_SEG, rc); } static void dispatch_rw_block_io(struct task_struct *p, int index) { extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]); blk_ring_t *blk_ring = p->blk_ring_base; + blk_ring_req_entry_t *req = &blk_ring->ring[index].req; struct buffer_head *bh; - int operation; - unsigned short size; - unsigned long block_number = 0L; - unsigned long sector_number = 0L; - unsigned long buffer, pfn; - struct pfn_info *page; - int s, xen_device, phys_device = 0; + int operation = (req->operation == XEN_BLOCK_WRITE) ? WRITE : READ; + unsigned short nr_sects; + unsigned long buffer, flags; + int i, tot_sects; + pending_req_t *pending_req; - operation = (blk_ring->ring[index].req.operation == XEN_BLOCK_WRITE) ? - WRITE : READ; + /* We map virtual scatter/gather segments to physical segments. */ + int new_segs, nr_psegs = 0; + phys_seg_t phys_seg[MAX_BLK_SEGS * 2]; - /* Sectors are 512 bytes. Make sure request size is a multiple. */ - size = blk_ring->ring[index].req.block_size; - if ( (size == 0) || (size & (0x200 - 1)) != 0 ) - { - DPRINTK("dodgy block size: %d\n", - blk_ring->ring[index].req.block_size); - goto bad_descriptor; - } + spin_lock_irqsave(&p->page_lock, flags); - /* Buffer address should be sector aligned. */ - buffer = (unsigned long)blk_ring->ring[index].req.buffer; - if ( (buffer & (0x200 - 1)) != 0 ) + /* Check that number of segments is sane. 
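Roughly, the per-request checks applied here look like the sketch below (user-space C with illustrative names; the field layout mirrors the interface header later in this patch): a request is rejected if its segment count is out of range or if any segment encodes zero sectors.

#include <stdio.h>

#define MAX_SEGS  12
#define SECT_MASK 0x1FFUL                  // low 9 bits of each entry = sector count

struct sg_request {
    unsigned char nr_segments;
    unsigned long buffer_and_sects[MAX_SEGS];
};

static int request_is_sane(const struct sg_request *req)
{
    if (req->nr_segments == 0 || req->nr_segments > MAX_SEGS)
        return 0;
    for (int i = 0; i < req->nr_segments; i++)
        if ((req->buffer_and_sects[i] & SECT_MASK) == 0)
            return 0;                      // zero-sized segment
    return 1;
}

int main(void)
{
    struct sg_request r = { .nr_segments = 1, .buffer_and_sects = { 0x8000 | 8 } };
    printf("sane: %d\n", request_is_sane(&r));
    return 0;
}
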
*/ + if ( (req->nr_segments == 0) || (req->nr_segments > MAX_BLK_SEGS) ) { - DPRINTK("unaligned buffer %08lx\n", buffer); + DPRINTK("Bad number of segments in request (%d)\n", req->nr_segments); goto bad_descriptor; } - /* A request may span multiple page frames. Each must be checked. */ - for ( pfn = buffer >> PAGE_SHIFT; - pfn < ((buffer + size + PAGE_SIZE - 1) >> PAGE_SHIFT); - pfn++ ) + /* + * Check each address/size pair is sane, and convert into a + * physical device and block offset. Note that if the offset and size + * crosses a virtual extent boundary, we may end up with more + * physical scatter/gather segments than virtual segments. + */ + for ( i = tot_sects = 0; i < req->nr_segments; i++, tot_sects += nr_sects ) { - /* Each frame must be within bounds of machine memory. */ - if ( pfn >= max_page ) + buffer = req->buffer_and_sects[i] & ~0x1FF; + nr_sects = req->buffer_and_sects[i] & 0x1FF; + + if ( nr_sects == 0 ) { - DPRINTK("pfn out of range: %08lx\n", pfn); - goto bad_descriptor_free_frames; + DPRINTK("zero-sized data request\n"); + goto bad_descriptor; } - page = frame_table + pfn; + if ( !__buffer_is_valid(p, buffer, nr_sects<<9, (operation==READ)) ) + goto bad_descriptor; - /* Each frame must belong to the requesting domain. */ - if ( (page->flags & PG_domain_mask) != p->domain ) + /* Get the physical device and block index. */ + if ( (req->device & XENDEV_TYPE_MASK) == XENDEV_VIRTUAL ) { - DPRINTK("bad domain: expected %d, got %ld\n", - p->domain, page->flags & PG_domain_mask); - goto bad_descriptor_free_frames; + new_segs = xen_segment_map_request( + &phys_seg[nr_psegs], p, operation, + req->device, + req->sector_number + tot_sects, + buffer, nr_sects); + if ( new_segs <= 0 ) goto bad_descriptor; } - - /* If reading into the frame, the frame must be writeable. */ - if ( operation == READ ) + else { - if ( (page->flags & PG_type_mask) != PGT_writeable_page ) - { - DPRINTK("non-writeable page passed for block read\n"); - goto bad_descriptor_free_frames; - } - get_page_type(page); + phys_seg[nr_psegs].dev = xendev_to_physdev(req->device); + phys_seg[nr_psegs].sector_number = req->sector_number + tot_sects; + phys_seg[nr_psegs].buffer = buffer; + phys_seg[nr_psegs].nr_sects = nr_sects; + if ( phys_seg[nr_psegs].dev == 0 ) goto bad_descriptor; + new_segs = 1; } - - /* Xen holds a frame reference until the operation is complete. */ - get_page_tot(page); + + nr_psegs += new_segs; + if ( nr_psegs >= (MAX_BLK_SEGS*2) ) BUG(); } - atomic_inc(&nr_pending); - bh = kmem_cache_alloc(buffer_head_cachep, GFP_KERNEL); - if ( bh == NULL ) panic("bh is null\n"); - - /* set just the important bits of the buffer header */ - memset (bh, 0, sizeof (struct buffer_head)); - - xen_device = blk_ring->ring[index].req.device; + /* Lock pages associated with each buffer head. */ + for ( i = 0; i < nr_psegs; i++ ) + __lock_buffer(phys_seg[i].buffer, phys_seg[i].nr_sects<<9, + (operation==READ)); + spin_unlock_irqrestore(&p->page_lock, flags); - again: - switch ( (xen_device & XENDEV_TYPE_MASK) ) + atomic_inc(&nr_pending); + pending_req = pending_reqs + pending_ring[pending_cons]; + PENDREQ_IDX_INC(pending_cons); + pending_req->domain = p; + pending_req->id = req->id; + pending_req->operation = operation; + atomic_set(&pending_req->pendcnt, nr_psegs); + + /* Now we pass each segment down to the real blkdev layer. 
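The key pattern in the dispatch path is the shared completion count: one pending request, several buffer_heads, and the last completion to arrive generates the response. A minimal sketch of that pattern (plain C, with an ordinary counter standing in for atomic_t):

#include <stdio.h>

struct parent_req {
    int pendcnt;                           // children still outstanding
    unsigned long id;                      // guest-supplied id to echo back
};

static void send_response(struct parent_req *r)
{
    printf("response for id %lu\n", r->id);
}

static void child_done(struct parent_req *r)
{
    if (--r->pendcnt == 0)                 // atomic_dec_and_test() in the real code
        send_response(r);
}

int main(void)
{
    struct parent_req r = { .pendcnt = 3, .id = 42 };
    child_done(&r);                        // completions may arrive in any order
    child_done(&r);
    child_done(&r);                        // the third one fires the response
    return 0;
}
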
*/ + for ( i = 0; i < nr_psegs; i++ ) { - case XENDEV_IDE: - xen_device &= XENDEV_IDX_MASK; - if ( xen_device >= NR_IDE_DEVS ) - { - DPRINTK("IDE device number out of range %d\n", xen_device); - goto bad_descriptor_free_frames; - } - phys_device = ide_devs[xen_device]; - block_number = blk_ring->ring[index].req.block_number; - sector_number = blk_ring->ring[index].req.sector_number; - break; - - case XENDEV_SCSI: - xen_device &= XENDEV_IDX_MASK; - if ( xen_device >= NR_SCSI_DEVS ) + bh = kmem_cache_alloc(buffer_head_cachep, GFP_KERNEL); + if ( bh == NULL ) panic("bh is null\n"); + memset (bh, 0, sizeof (struct buffer_head)); + + bh->b_size = phys_seg[i].nr_sects << 9; + bh->b_dev = phys_seg[i].dev; + bh->b_rsector = phys_seg[i].sector_number; + bh->b_data = phys_to_virt(phys_seg[i].buffer); + bh->b_end_io = end_block_io_op; + bh->pending_req = pending_req; + + if ( operation == WRITE ) { - DPRINTK("SCSI device number out of range %d\n", xen_device); - goto bad_descriptor_free_frames; - } - phys_device = scsi_devs[xen_device]; - block_number = blk_ring->ring[index].req.block_number; - sector_number = blk_ring->ring[index].req.sector_number; - break; - - case XENDEV_VIRTUAL: - xen_device &= XENDEV_IDX_MASK; - s = xen_segment_map_request( - &xen_device, &block_number, §or_number, - p, operation, xen_device, - blk_ring->ring[index].req.block_number, - blk_ring->ring[index].req.sector_number); - if ( s != 0 ) + bh->b_state = (1 << BH_JBD) | (1 << BH_Mapped) | (1 << BH_Req) | + (1 << BH_Dirty) | (1 << BH_Uptodate) | (1 << BH_Write); + } + else { - DPRINTK("xen_seg_map_request status: %d\n", s); - goto bad_descriptor_free_frames; + bh->b_state = (1 << BH_Mapped) | (1 << BH_Read); } - goto again; /* Loop round to convert the virt IDE/SCSI identifier. */ - default: - DPRINTK("dispatch_rw_block_io: unknown device %d\n", xen_device); - goto bad_descriptor_free_frames; - } - - bh->b_blocknr = block_number; - bh->b_size = size; - bh->b_dev = phys_device; - bh->b_rsector = sector_number; - bh->b_data = phys_to_virt(buffer); - bh->b_count.counter = 1; - bh->b_end_io = end_block_io_op; - - /* Save meta data about request. */ - bh->b_xen_domain = p; - bh->b_xen_id = blk_ring->ring[index].req.id; - - if ( operation == WRITE ) - { - bh->b_state = (1 << BH_JBD) | (1 << BH_Mapped) | (1 << BH_Req) | - (1 << BH_Dirty) | (1 << BH_Uptodate) | (1 << BH_Write); - } - else - { - bh->b_state = (1 << BH_Mapped) | (1 << BH_Read); + /* Dispatch a single request. We'll flush it to disc later. */ + ll_rw_block(operation, 1, &bh); } - /* Dispatch a single request. We'll flush it to disc later. 
*/ - ll_rw_block(operation, 1, &bh); return; - bad_descriptor_free_frames: - while ( pfn > (buffer >> PAGE_SHIFT) ) - { - page = frame_table + --pfn; - put_page_tot(page); - if ( operation == READ ) put_page_type(page); - } - - bad_descriptor: - DPRINTK("dispatch rw blockio bad descriptor\n"); - make_response(p, blk_ring->ring[index].req.id, XEN_BLOCK_READ, 1); + bad_descriptor: + spin_unlock_irqrestore(&p->page_lock, flags); + make_response(p, req->id, req->operation, 1); } @@ -490,8 +615,38 @@ static void dispatch_rw_block_io(struct task_struct *p, int index) * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING */ -static void make_response(struct task_struct *p, void *id, - int op, unsigned long st) +kdev_t xendev_to_physdev(unsigned short xendev) +{ + switch ( (xendev & XENDEV_TYPE_MASK) ) + { + case XENDEV_IDE: + xendev &= XENDEV_IDX_MASK; + if ( xendev >= NR_IDE_DEVS ) + { + DPRINTK("IDE device number out of range %d\n", xendev); + goto fail; + } + return ide_devs[xendev]; + + case XENDEV_SCSI: + xendev &= XENDEV_IDX_MASK; + if ( xendev >= NR_SCSI_DEVS ) + { + DPRINTK("SCSI device number out of range %d\n", xendev); + goto fail; + } + return scsi_devs[xendev]; + + default: + DPRINTK("xendev_to_physdev: unknown device %d\n", xendev); + } + + fail: + return (kdev_t)0; +} + +static void make_response(struct task_struct *p, unsigned long id, + unsigned short op, unsigned long st) { unsigned long cpu_mask, flags; int position; @@ -500,11 +655,11 @@ static void make_response(struct task_struct *p, void *id, /* Place on the response ring for the relevant domain. */ spin_lock_irqsave(&p->blk_ring_lock, flags); blk_ring = p->blk_ring_base; - position = blk_ring->resp_prod; + position = p->blk_resp_prod; blk_ring->ring[position].resp.id = id; blk_ring->ring[position].resp.operation = op; blk_ring->ring[position].resp.status = st; - blk_ring->resp_prod = BLK_RING_INC(position); + p->blk_resp_prod = blk_ring->resp_prod = BLK_RING_INC(position); spin_unlock_irqrestore(&p->blk_ring_lock, flags); /* Kick the relevant domain. */ @@ -517,18 +672,22 @@ static void dump_blockq(u_char key, void *dev_id, struct pt_regs *regs) struct task_struct *p; blk_ring_t *blk_ring ; - printk("Dumping block queue stats: nr_pending = %d\n", - atomic_read(&nr_pending)); + printk("Dumping block queue stats: nr_pending = %d (prod=%d,cons=%d)\n", + atomic_read(&nr_pending), pending_prod, pending_cons); p = current->next_task; do { - printk (KERN_ALERT "Domain: %d\n", p->domain); - blk_ring = p->blk_ring_base; - - printk(" req_prod:%d, resp_prod:%d, req_cons:%d\n", - blk_ring->req_prod, blk_ring->resp_prod, p->blk_req_cons); - + if ( !is_idle_task(p) ) + { + printk("Domain: %d\n", p->domain); + blk_ring = p->blk_ring_base; + + printk(" req_prod:%d, req_cons:%d resp_prod:%d/%d on_list=%d\n", + blk_ring->req_prod, p->blk_req_cons, + blk_ring->resp_prod, p->blk_resp_prod, + __on_blkdev_list(p)); + } p = p->next_task; } while (p != current); } @@ -545,7 +704,8 @@ void init_blkdev_info(struct task_struct *p) memset(p->segment_list, 0, sizeof(p->segment_list)); p->segment_count = 0; - xen_refresh_segment_list(p); /* get any previously created segments */ + /* Get any previously created segments. */ + xen_refresh_segment_list(p); } /* End-of-day teardown for a domain. XXX Outstanding requests? 
*/ @@ -558,7 +718,12 @@ void destroy_blkdev_info(struct task_struct *p) void initialize_block_io () { + int i; + atomic_set(&nr_pending, 0); + pending_prod = pending_cons = 0; + memset(pending_reqs, 0, sizeof(pending_reqs)); + for ( i = 0; i < MAX_PENDING_REQS; i++ ) pending_ring[i] = i; spin_lock_init(&io_schedule_list_lock); INIT_LIST_HEAD(&io_schedule_list); diff --git a/xen/drivers/block/xen_segment.c b/xen/drivers/block/xen_segment.c index 62fa6a3ea6..e644aad115 100644 --- a/xen/drivers/block/xen_segment.c +++ b/xen/drivers/block/xen_segment.c @@ -23,70 +23,73 @@ segment_t xsegments[XEN_MAX_SEGMENTS]; * xen_device must be a valid device. */ +/* + * NB. Al offsets and sizes here are in sector units. + * eg. 'size == 1' means an actual size of 512 bytes. + */ int xen_segment_map_request( - int *phys_device, /* out */ - unsigned long *block_number, /* out */ - unsigned long *sector_number, /* out */ - struct task_struct *domain, - int operation, - int segment_number, - int xen_block_number, - int xen_sector_number) + phys_seg_t *pseg, struct task_struct *p, int operation, + unsigned short segment_number, + unsigned long sect_nr, unsigned long buffer, unsigned short nr_sects) { segment_t *seg; - int sum; - int loop; + extent_t *ext; + int sum, i; - if ( segment_number >= XEN_MAX_SEGMENTS ) - { - /* No VHD. */ - return 1; - } + if ( segment_number >= XEN_MAX_SEGMENTS ) goto fail; - seg = domain->segment_list[segment_number]; - - if (seg == NULL) - { - /* oops. no vhd exists! */ - return 1; - } + seg = p->segment_list[segment_number]; + if ( seg == NULL ) goto fail; /* check domain permissions */ - if (seg->domain != domain->domain) - { - /* domain doesn't own segment */ - return 2; - } + if ( seg->domain != p->domain ) goto fail; /* check rw access */ if ((operation == WRITE && seg->mode != XEN_SEGMENT_RW) || (operation == READ && seg->mode == XEN_SEGMENT_UNUSED)) - { - /* access violation */ - return 3; - } + goto fail; /* find extent, check size */ sum = 0; - loop = 0; - while (loop < seg->num_extents && sum <= xen_block_number) + i = 0; + ext = seg->extents; + while ( (i < seg->num_extents) && ((sum + ext->size) <= sect_nr) ) { - sum += seg->extents[loop++].size; + sum += ext->size; + ext++; i++; } - sum -= seg->extents[--loop].size; - if (sum + seg->extents[loop].size <= xen_block_number) - { - /* tried to read past the end of the segment */ - return 4; - } - *block_number = xen_block_number - sum + seg->extents[loop].offset; - *sector_number = xen_sector_number - sum + seg->extents[loop].offset;; + if ( (sum + ext->size) <= sect_nr ) goto fail; - /* This actually needs to be passed thru one more indirection :-) */ - *phys_device = seg->extents[loop].disk; + pseg->sector_number = sect_nr + ext->offset - sum; + pseg->buffer = buffer; + pseg->nr_sects = nr_sects; + pseg->dev = xendev_to_physdev(ext->disk); + if ( pseg->dev == 0 ) goto fail; - return 0; + /* We're finished if the virtual extent didn't overrun the phys extent. */ + if ( (sum + ext->size) >= (sect_nr + nr_sects) ) + return 1; /* Just one more physical extent. */ + + /* Hmmm... make sure there's another extent to overrun onto! */ + if ( (i+1) == seg->num_extents ) goto fail; + + pseg[1].nr_sects = (sect_nr + nr_sects) - (sum + ext->size); + pseg[0].nr_sects = sum + ext->size - sect_nr; + pseg[1].buffer = buffer + (pseg->nr_sects << 9); + pseg[1].sector_number = ext[1].offset; + pseg[1].dev = xendev_to_physdev(ext[1].disk); + if ( pseg[1].dev == 0 ) goto fail; + + /* We don't allow overrun onto a third physical extent. 
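A self-contained sketch of this mapping step (illustrative extent layout, not Xen's structures): a request against the virtual disk is located within the extent list and split into at most two physical pieces, failing if it would need a third.

#include <stdio.h>

struct extent { unsigned long offset, size; };    // on the physical disk, in sectors

// Returns the number of physical pieces (1 or 2), or -1 on failure.
static int map_request(const struct extent *ext, int n_ext,
                       unsigned long sect, unsigned long nr,
                       unsigned long out_start[2], unsigned long out_len[2])
{
    unsigned long sum = 0;
    int i = 0;

    while (i < n_ext && sum + ext[i].size <= sect) {   // find containing extent
        sum += ext[i].size;
        i++;
    }
    if (i == n_ext)
        return -1;                                     // past the end of the disk

    out_start[0] = ext[i].offset + (sect - sum);
    if (sect + nr <= sum + ext[i].size) {              // fits in one extent
        out_len[0] = nr;
        return 1;
    }
    if (i + 1 == n_ext)
        return -1;                                     // no extent to overrun onto
    out_len[0]   = sum + ext[i].size - sect;           // first piece, up to the boundary
    out_start[1] = ext[i + 1].offset;
    out_len[1]   = nr - out_len[0];
    if (out_len[1] > ext[i + 1].size)
        return -1;                                     // would need a third extent
    return 2;
}

int main(void)
{
    struct extent disk[2] = { { 1000, 100 }, { 5000, 100 } };
    unsigned long s[2], l[2];
    int n = map_request(disk, 2, 90, 20, s, l);        // straddles the boundary
    for (int i = 0; i < n; i++)
        printf("piece %d: start %lu len %lu\n", i, s[i], l[i]);
    return 0;
}
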
*/ + if ( (sum + ext[0].size + ext[1].size) < + (pseg[1].sector_number + pseg[1].nr_sects) ) + goto fail; + + return 2; /* We overran onto a second physical es\xtent. */ + + fail: + return -1; } /* diff --git a/xen/drivers/ide/ide-dma.c b/xen/drivers/ide/ide-dma.c index 6ce5fd4b1f..c4661a6fbc 100644 --- a/xen/drivers/ide/ide-dma.c +++ b/xen/drivers/ide/ide-dma.c @@ -271,7 +271,7 @@ static int ide_build_sglist (ide_hwif_t *hwif, struct request *rq) /* * continue segment from before? */ - if (bh_phys(bh) == lastdataend) { + if (virt_to_phys(bh->b_data) == lastdataend) { sg[nents - 1].length += bh->b_size; lastdataend += bh->b_size; continue; @@ -285,25 +285,9 @@ static int ide_build_sglist (ide_hwif_t *hwif, struct request *rq) sge = &sg[nents]; memset(sge, 0, sizeof(*sge)); - - if (bh->b_page) { - sge->page = bh->b_page; - sge->offset = bh_offset(bh); - } else { - - -#if 0 - /* below is wrong for xen since b_data is actually - a 'physical / virtual' thingy. Ask KAF. */ - if (((unsigned long) bh->b_data) < PAGE_SIZE) - BUG(); -#endif - - sge->address = bh->b_data; - } - + sge->address = bh->b_data; sge->length = bh->b_size; - lastdataend = bh_phys(bh) + bh->b_size; + lastdataend = virt_to_phys(bh->b_data) + bh->b_size; nents++; } while ((bh = bh->b_reqnext) != NULL); diff --git a/xen/include/hypervisor-ifs/block.h b/xen/include/hypervisor-ifs/block.h index 476af1ab54..1b228c5c85 100644 --- a/xen/include/hypervisor-ifs/block.h +++ b/xen/include/hypervisor-ifs/block.h @@ -34,37 +34,42 @@ */ /* the first four definitions match fs.h */ -#define XEN_BLOCK_READ 0 -#define XEN_BLOCK_WRITE 1 -#define XEN_BLOCK_READA 2 /* currently unused */ -#define XEN_BLOCK_SPECIAL 4 /* currently unused */ -#define XEN_BLOCK_PROBE_BLK 8 /* get xhd config from hypervisor */ -#define XEN_BLOCK_DEBUG 16 /* debug */ -#define XEN_BLOCK_SEG_CREATE 32 /* create segment (vhd) */ -#define XEN_BLOCK_SEG_DELETE 64 /* delete segment (vhd) */ -#define XEN_BLOCK_PROBE_SEG 128 /* get vhd config from hypervisor */ - -#define BLK_RING_SIZE 128 -#define BLK_RING_MAX_ENTRIES (BLK_RING_SIZE - 2) +#define XEN_BLOCK_READ 0 +#define XEN_BLOCK_WRITE 1 +#define XEN_BLOCK_READA 2 +#define XEN_BLOCK_SPECIAL 4 +#define XEN_BLOCK_PROBE_BLK 5 /* get xhd config from hypervisor */ +#define XEN_BLOCK_DEBUG 6 /* debug */ +#define XEN_BLOCK_SEG_CREATE 7 /* create segment (vhd) */ +#define XEN_BLOCK_SEG_DELETE 8 /* delete segment (vhd) */ +#define XEN_BLOCK_PROBE_SEG 9 /* get vhd config from hypervisor */ + +/* NB. Ring size must be small enough for sizeof(blk_ring_t) <= PAGE_SIZE. */ +#define BLK_RING_SIZE 64 #define BLK_RING_INC(_i) (((_i)+1) & (BLK_RING_SIZE-1)) -#define BLK_RING_ADD(_i,_j) (((_i)+(_j)) & (BLK_RING_SIZE-1)) + +/* + * Maximum scatter/gather segments per request. + * This is carefully chosen so that sizeof(blk_ring_t) <= PAGE_SIZE. + */ +#define MAX_BLK_SEGS 12 typedef struct blk_ring_req_entry { - void * id; /* for guest os use */ - int operation; /* from above */ - char * buffer; - unsigned long block_number; /* block number */ - unsigned short block_size; /* block size */ - unsigned short device; - unsigned long sector_number; /* real buffer location on disk */ + unsigned long id; /* private guest os value */ + unsigned long sector_number; /* start sector idx on disk */ + unsigned short device; /* XENDEV_??? + idx */ + unsigned char operation; /* XEN_BLOCK_??? */ + unsigned char nr_segments; /* number of segments */ + /* Least 9 bits is 'nr_sects'. High 23 bits are the address. 
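Because buffers are sector aligned, the low nine bits of the address are always zero and can carry the sector count instead. A small sketch of packing and unpacking such a word (macro names are made up for illustration):

#include <stdio.h>
#include <assert.h>

#define SECT_SHIFT 9
#define SECT_MASK  ((1UL << SECT_SHIFT) - 1)           // 0x1FF

#define PACK(buf, nsects)   (((unsigned long)(buf) & ~SECT_MASK) | (nsects))
#define UNPACK_BUF(word)    ((word) & ~SECT_MASK)
#define UNPACK_SECTS(word)  ((word) &  SECT_MASK)

int main(void)
{
    unsigned long buf = 0x12345600;                    // sector-aligned address
    unsigned long w   = PACK(buf, 8);                  // 8 sectors == 4 KB

    assert((buf & SECT_MASK) == 0);                    // alignment is a precondition
    printf("buf=%#lx sects=%lu\n", UNPACK_BUF(w), UNPACK_SECTS(w));
    return 0;
}

Packing both fields into one word keeps each request entry small and fixed-size, which is what lets the whole ring still fit in a single shared page.
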
*/ + unsigned long buffer_and_sects[MAX_BLK_SEGS]; } blk_ring_req_entry_t; typedef struct blk_ring_resp_entry { - void * id; /* for guest os use */ - int operation; /* from above */ - unsigned long status; + unsigned long id; /* copied from request */ + unsigned short operation; /* copied from request */ + unsigned long status; /* cuurently boolean good/bad */ } blk_ring_resp_entry_t; typedef struct blk_ring_st diff --git a/xen/include/xeno/blkdev.h b/xen/include/xeno/blkdev.h index a2cd390517..7a6a6844dd 100644 --- a/xen/include/xeno/blkdev.h +++ b/xen/include/xeno/blkdev.h @@ -15,6 +15,15 @@ #define BLOCK_SIZE_BITS 10 #define BLOCK_SIZE (1<b_page) - #define atomic_set_buffer_clean(bh) test_and_clear_bit(BH_Dirty, &(bh)->b_state) static inline void __mark_buffer_clean(struct buffer_head *bh) @@ -261,8 +258,6 @@ struct request_queue #endif }; -#define bh_phys(bh) (page_to_phys((bh)->b_page) + bh_offset((bh))) - struct blk_dev_struct { /* * queue_proc has to be atomic diff --git a/xen/include/xeno/sched.h b/xen/include/xeno/sched.h index da86349149..6d1842a2ea 100644 --- a/xen/include/xeno/sched.h +++ b/xen/include/xeno/sched.h @@ -78,7 +78,8 @@ struct task_struct { /* Block I/O */ blk_ring_t *blk_ring_base; - unsigned int blk_req_cons; /* request consumer */ + unsigned int blk_req_cons; /* request consumer */ + unsigned int blk_resp_prod; /* (private version of) response producer */ struct list_head blkdev_list; spinlock_t blk_ring_lock; segment_t *segment_list[XEN_MAX_SEGMENTS]; /* vhd */ @@ -89,6 +90,8 @@ struct task_struct { struct list_head run_list; struct mm_struct mm; + /* We need this lock to check page types and frob reference counts. */ + spinlock_t page_lock; mm_segment_t addr_limit; /* thread address space: 0-0xBFFFFFFF for user-thead diff --git a/xen/include/xeno/segment.h b/xen/include/xeno/segment.h index abbeea278c..f6fcbb3958 100644 --- a/xen/include/xeno/segment.h +++ b/xen/include/xeno/segment.h @@ -3,18 +3,21 @@ #include +/* Describes a physical disk extent. 
*/ +typedef struct { + unsigned short dev; + unsigned short nr_sects; + unsigned long sector_number; + unsigned long buffer; +} phys_seg_t; + void xen_segment_initialize(void); void xen_refresh_segment_list (struct task_struct *p); int xen_segment_create(xv_disk_t *xvd); int xen_segment_map_request( - int *phys_device, /* out */ - unsigned long *block_number, /* out */ - unsigned long *sector_number, /* out */ - struct task_struct *domain, - int operation, - int segment_number, - int xen_block_number, - int xen_sector_number); + phys_seg_t *pseg, struct task_struct *p, int operation, + unsigned short segment_number, + unsigned long sect_nr, unsigned long buffer, unsigned short nr_sects); #define XEN_MAX_SEGMENTS 100 /* total number of segments across all doms */ diff --git a/xen/net/dev.c b/xen/net/dev.c index 03039e9c81..f8f8fe70cf 100644 --- a/xen/net/dev.c +++ b/xen/net/dev.c @@ -489,6 +489,7 @@ void deliver_packet(struct sk_buff *skb, net_vif_t *vif) unsigned long *g_pte; struct pfn_info *g_pfn, *h_pfn; unsigned int i; + unsigned long flags; memset(skb->mac.ethernet->h_dest, 0, ETH_ALEN); if ( ntohs(skb->mac.ethernet->h_proto) == ETH_P_ARP ) @@ -508,6 +509,8 @@ void deliver_packet(struct sk_buff *skb, net_vif_t *vif) if ( (skb->len + ETH_HLEN) < rx->size ) rx->size = skb->len + ETH_HLEN; + spin_lock_irqsave(&vif->domain->page_lock, flags); + g_pte = map_domain_mem(rx->addr); g_pfn = frame_table + (*g_pte >> PAGE_SHIFT); @@ -526,9 +529,11 @@ void deliver_packet(struct sk_buff *skb, net_vif_t *vif) *g_pte = (*g_pte & ~PAGE_MASK) | (((h_pfn - frame_table) << PAGE_SHIFT) & PAGE_MASK); *g_pte |= _PAGE_PRESENT; - + unmap_domain_mem(g_pte); + spin_unlock_irqrestore(&vif->domain->page_lock, flags); + /* Our skbuff now points at the guest's old frame. */ skb->pf = g_pfn; @@ -661,10 +666,12 @@ static void tx_skb_release(struct sk_buff *skb) net_vif_t *vif = sys_vif_list[skb->src_vif]; unsigned int idx; tx_shadow_entry_t *tx; - unsigned long cpu_mask; + unsigned long cpu_mask, flags; + spin_lock_irqsave(&vif->domain->page_lock, flags); for ( i = 0; i < skb_shinfo(skb)->nr_frags; i++ ) put_page_tot(skb_shinfo(skb)->frags[i].page); + spin_unlock_irqrestore(&vif->domain->page_lock, flags); if ( skb->skb_type == SKB_NODATA ) kmem_cache_free(net_header_cachep, skb->head); @@ -713,8 +720,7 @@ static void tx_skb_release(struct sk_buff *skb) /* Send a transmit event if requested. 
*/ if ( send ) { - cpu_mask = mark_guest_event( - sys_vif_list[skb->src_vif]->domain, _EVENT_NET_TX); + cpu_mask = mark_guest_event(vif->domain, _EVENT_NET_TX); guest_event_notify(cpu_mask); } } @@ -1870,10 +1876,12 @@ long do_net_update(void) pfn = tx.addr >> PAGE_SHIFT; page = frame_table + pfn; + spin_lock_irq(¤t->page_lock); if ( (pfn >= max_page) || ((page->flags & PG_domain_mask) != current->domain) ) { DPRINTK("Bad page frame\n"); + spin_unlock_irq(¤t->page_lock); continue; } @@ -1882,7 +1890,7 @@ long do_net_update(void) protocol = __constant_htons( init_tx_header(g_data, tx.size, the_dev)); if ( protocol == 0 ) - goto unmap_and_continue; + goto tx_unmap_and_continue; target = __net_get_target_vif(g_data, tx.size, current_vif->id); @@ -1890,7 +1898,7 @@ long do_net_update(void) { /* Local delivery */ if ( (skb = dev_alloc_skb(tx.size)) == NULL ) - goto unmap_and_continue; + goto tx_unmap_and_continue; skb->destructor = tx_skb_release; @@ -1915,15 +1923,16 @@ long do_net_update(void) shadow_ring->tx_ring[i].header = kmem_cache_alloc(net_header_cachep, GFP_KERNEL); if ( shadow_ring->tx_ring[i].header == NULL ) - goto unmap_and_continue; + goto tx_unmap_and_continue; memcpy(shadow_ring->tx_ring[i].header, g_data, PKT_PROT_LEN); shadow_ring->tx_ring[i].payload = tx.addr + PKT_PROT_LEN; shadow_ring->tx_ring[i].status = RING_STATUS_OK; get_page_tot(page); } - unmap_and_continue: + tx_unmap_and_continue: unmap_domain_mem(g_data); + spin_unlock_irq(¤t->page_lock); } if ( shadow_ring->tx_prod != i ) @@ -1966,10 +1975,12 @@ long do_net_update(void) shadow_ring->rx_ring[i].status = RING_STATUS_BAD_PAGE; + spin_lock_irq(¤t->page_lock); if ( (pfn >= max_page) || (page->flags != (PGT_l1_page_table | current->domain)) ) { DPRINTK("Bad page frame containing ppte\n"); + spin_unlock_irq(¤t->page_lock); continue; } @@ -1978,8 +1989,7 @@ long do_net_update(void) if (!(*g_pte & _PAGE_PRESENT)) { DPRINTK("Inavlid PTE passed down (not present)\n"); - unmap_domain_mem(g_pte); - continue; + goto rx_unmap_and_continue; } page = (*g_pte >> PAGE_SHIFT) + frame_table; @@ -1987,8 +1997,7 @@ long do_net_update(void) if (page->tot_count != 1) { DPRINTK("An rx page must be mapped exactly once\n"); - unmap_domain_mem(g_pte); - continue; + goto rx_unmap_and_continue; } /* The pte they passed was good, so take it away from them. */ @@ -1997,7 +2006,9 @@ long do_net_update(void) page->flags = (page->flags & ~PG_type_mask) | PGT_net_rx_buf; rx->flush_count = tlb_flush_count[smp_processor_id()]; + rx_unmap_and_continue: unmap_domain_mem(g_pte); + spin_unlock_irq(¤t->page_lock); } if ( shadow_ring->rx_prod != i ) diff --git a/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/block/xl_block.c b/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/block/xl_block.c index c1177fa7c8..40f93cc251 100644 --- a/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/block/xl_block.c +++ b/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/block/xl_block.c @@ -19,10 +19,11 @@ typedef unsigned char byte; /* from linux/ide.h */ static blk_ring_t *blk_ring; static unsigned int resp_cons; /* Response consumer for comms ring. */ +static unsigned int req_prod; /* Private request producer. */ static xen_disk_info_t xlblk_disk_info; static int xlblk_control_msg_pending; -#define RING_FULL (BLK_RING_INC(blk_ring->req_prod) == resp_cons) +#define RING_FULL (BLK_RING_INC(req_prod) == resp_cons) /* * Request queues with outstanding work, but ring is currently full. 
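The queue-parking scheme referred to in that comment works roughly as sketched below (illustrative, user-space C): a queue that cannot be submitted because the shared ring is full is remembered, and the response handler retries it once slots have drained.

#include <stdio.h>

#define MAX_PARKED 4
static int parked_queues[MAX_PARKED];      // queue ids with work we couldn't submit
static int nr_parked;

static int ring_full;                      // stands in for the RING_FULL test

static int submit(int queue_id)
{
    if (ring_full) {
        parked_queues[nr_parked++] = queue_id;   // park it for later
        return 0;
    }
    printf("queue %d submitted\n", queue_id);
    return 1;
}

static void on_responses_drained(void)     // called from the response handler
{
    int n = nr_parked;
    nr_parked = 0;
    ring_full = 0;
    for (int i = 0; i < n; i++)
        submit(parked_queues[i]);          // retry everything that was parked
}

int main(void)
{
    ring_full = 1;
    submit(7);                             // parked
    on_responses_drained();                // retried and submitted
    return 0;
}
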
@@ -33,6 +34,18 @@ static int xlblk_control_msg_pending; static request_queue_t *pending_queues[MAX_PENDING]; static int nr_pending; +static kdev_t sg_dev; +static int sg_operation = -1; +static unsigned long sg_next_sect; +#define DISABLE_SCATTERGATHER() (sg_operation = -1) + +static inline void signal_requests_to_xen(void) +{ + DISABLE_SCATTERGATHER(); + blk_ring->req_prod = req_prod; + HYPERVISOR_block_io_op(); +} + /* Convert from a XenoLinux major device to the Xen-level 'physical' device */ static inline unsigned short xldev_to_physdev(kdev_t xldev) { @@ -253,31 +266,22 @@ int xenolinux_block_revalidate(kdev_t dev) * operation: XEN_BLOCK_{READ,WRITE,PROBE*,SEG*} * buffer: buffer to read/write into. this should be a * virtual address in the guest os. - * block_number: block to read - * block_size: size of each block - * device: xhd*, ksd*, xvd*, ... */ -static int hypervisor_request(void * id, +static int hypervisor_request(unsigned long id, int operation, char * buffer, - unsigned long block_number, - unsigned short block_size, + unsigned long sector_number, + unsigned short nr_sectors, kdev_t device) { - int position; - void *buffer_ma; + unsigned long buffer_ma = phys_to_machine(virt_to_phys(buffer)); kdev_t phys_device = (kdev_t) 0; - unsigned long sector_number = 0; struct gendisk *gd; - - /* - * Bail if there's no room in the request communication ring. This may be - * because we have a whole bunch of outstanding responses to process. No - * matter, as the response handler will kick the request queue. - */ - if ( RING_FULL ) return 1; + blk_ring_req_entry_t *req; + struct buffer_head *bh; - buffer_ma = (void *)phys_to_machine(virt_to_phys(buffer)); + if ( nr_sectors >= (1<<9) ) BUG(); + if ( (buffer_ma & ((1<<9)-1)) != 0 ) BUG(); switch ( operation ) { @@ -285,17 +289,42 @@ static int hypervisor_request(void * id, case XEN_BLOCK_SEG_DELETE: case XEN_BLOCK_PROBE_BLK: case XEN_BLOCK_PROBE_SEG: + if ( RING_FULL ) return 1; phys_device = (kdev_t) 0; sector_number = 0; + DISABLE_SCATTERGATHER(); break; case XEN_BLOCK_READ: case XEN_BLOCK_WRITE: phys_device = xldev_to_physdev(device); - /* Compute real buffer location on disk */ - sector_number = block_number; gd = xldev_to_gendisk(device); sector_number += gd->part[MINOR(device)].start_sect; + if ( (sg_operation == operation) && + (sg_dev == phys_device) && + (sg_next_sect == sector_number) ) + { + req = &blk_ring->ring[(req_prod-1)&(BLK_RING_SIZE-1)].req; + bh = (struct buffer_head *)id; + bh->b_reqnext = (struct buffer_head *)req->id; + req->id = id; + req->buffer_and_sects[req->nr_segments] = buffer_ma | nr_sectors; + if ( ++req->nr_segments < MAX_BLK_SEGS ) + sg_next_sect += nr_sectors; + else + DISABLE_SCATTERGATHER(); + return 0; + } + else if ( RING_FULL ) + { + return 1; + } + else + { + sg_operation = operation; + sg_dev = phys_device; + sg_next_sect = sector_number + nr_sectors; + } break; default: @@ -303,16 +332,14 @@ static int hypervisor_request(void * id, } /* Fill out a communications ring structure. 
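The driver now keeps a private copy of the request-producer index and only publishes it to the shared ring (and traps to Xen) once per batch, via signal_requests_to_xen(). A bare-bones sketch of that idea with stand-in names:

#include <stdio.h>

#define RING_SIZE 8
#define RING_INC(i) (((i) + 1) & (RING_SIZE - 1))

struct shared_ring { unsigned int req_prod; int slot[RING_SIZE]; };

static struct shared_ring ring;            // shared with the hypervisor
static unsigned int req_prod_private;      // only the guest driver sees this

static void queue_request(int payload)
{
    ring.slot[req_prod_private] = payload;         // fill the entry first
    req_prod_private = RING_INC(req_prod_private);
}

static void signal_requests(void)
{
    ring.req_prod = req_prod_private;              // publish the whole batch
    printf("hypercall: %u requests now visible\n", ring.req_prod);
}

int main(void)
{
    queue_request(1);
    queue_request(2);
    queue_request(3);
    signal_requests();                             // one notification for three requests
    return 0;
}
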
*/ - position = blk_ring->req_prod; - blk_ring->ring[position].req.id = id; - blk_ring->ring[position].req.operation = operation; - blk_ring->ring[position].req.buffer = buffer_ma; - blk_ring->ring[position].req.block_number = block_number; - blk_ring->ring[position].req.block_size = block_size; - blk_ring->ring[position].req.device = phys_device; - blk_ring->ring[position].req.sector_number = sector_number; - - blk_ring->req_prod = BLK_RING_INC(position); + req = &blk_ring->ring[req_prod].req; + req->id = id; + req->operation = operation; + req->sector_number = sector_number; + req->device = phys_device; + req->nr_segments = 1; + req->buffer_and_sects[0] = buffer_ma | nr_sectors; + req_prod = BLK_RING_INC(req_prod); return 0; } @@ -325,7 +352,7 @@ static int hypervisor_request(void * id, void do_xlblk_request(request_queue_t *rq) { struct request *req; - struct buffer_head *bh; + struct buffer_head *bh, *next_bh; int rw, nsect, full, queued = 0; DPRINTK("xlblk.c::do_xlblk_request for '%s'\n", DEVICE_NAME); @@ -349,12 +376,17 @@ void do_xlblk_request(request_queue_t *rq) bh = req->bh; while ( bh != NULL ) { + next_bh = bh->b_reqnext; + bh->b_reqnext = NULL; + full = hypervisor_request( - bh, (rw == READ) ? XEN_BLOCK_READ : XEN_BLOCK_WRITE, - bh->b_data, bh->b_rsector, bh->b_size, bh->b_dev); + (unsigned long)bh, + (rw == READ) ? XEN_BLOCK_READ : XEN_BLOCK_WRITE, + bh->b_data, bh->b_rsector, bh->b_size>>9, bh->b_rdev); if ( full ) { + bh->b_reqnext = next_bh; pending_queues[nr_pending++] = rq; if ( nr_pending >= MAX_PENDING ) BUG(); goto out; @@ -364,9 +396,7 @@ void do_xlblk_request(request_queue_t *rq) /* Dequeue the buffer head from the request. */ nsect = bh->b_size >> 9; - req->bh = bh->b_reqnext; - bh->b_reqnext = NULL; - bh = req->bh; + bh = req->bh = next_bh; if ( bh != NULL ) { @@ -389,7 +419,7 @@ void do_xlblk_request(request_queue_t *rq) } out: - if ( queued != 0 ) HYPERVISOR_block_io_op(); + if ( queued != 0 ) signal_requests_to_xen(); } @@ -397,7 +427,7 @@ static void xlblk_response_int(int irq, void *dev_id, struct pt_regs *ptregs) { int i; unsigned long flags; - struct buffer_head *bh; + struct buffer_head *bh, *next_bh; spin_lock_irqsave(&io_request_lock, flags); @@ -410,7 +440,14 @@ static void xlblk_response_int(int irq, void *dev_id, struct pt_regs *ptregs) { case XEN_BLOCK_READ: case XEN_BLOCK_WRITE: - if ( (bh = bret->id) != NULL ) bh->b_end_io(bh, 1); + for ( bh = (struct buffer_head *)bret->id; + bh != NULL; + bh = next_bh ) + { + next_bh = bh->b_reqnext; + bh->b_reqnext = NULL; + bh->b_end_io(bh, 1); + } break; case XEN_BLOCK_SEG_CREATE: @@ -429,7 +466,7 @@ static void xlblk_response_int(int irq, void *dev_id, struct pt_regs *ptregs) /* We kick pending request queues if the ring is reasonably empty. */ if ( (nr_pending != 0) && - (((blk_ring->req_prod - resp_cons) & (BLK_RING_SIZE - 1)) < + (((req_prod - resp_cons) & (BLK_RING_SIZE - 1)) < (BLK_RING_SIZE >> 1)) ) { /* Attempt to drain the queue, but bail if the ring becomes full. */ @@ -445,13 +482,27 @@ static void xlblk_response_int(int irq, void *dev_id, struct pt_regs *ptregs) /* Send a synchronous message to Xen. */ -int xenolinux_control_msg(int operation, char *buffer) +int xenolinux_control_msg(int operation, char *buffer, int size) { - xlblk_control_msg_pending = 1; barrier(); - if ( hypervisor_request(NULL, operation, buffer, 0, 0, 0) ) + unsigned long flags; + char *aligned_buf; + + /* We copy from an aligned buffer, as interface needs sector alignment. 
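That is the classic bounce-buffer pattern, sketched below in ordinary user-space C (aligned_alloc stands in for get_free_page, the size is assumed to fit in one page, and data may need copying in, out, or both, depending on the operation):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define PAGE_SIZE 4096

static int control_msg(void *user_buf, size_t size)
{
    void *aligned = aligned_alloc(PAGE_SIZE, PAGE_SIZE);   // sector-aligned by construction
    if (aligned == NULL)
        return -1;

    memcpy(aligned, user_buf, size);       // stage the request data
    // ... submit 'aligned' and wait for completion here ...
    memcpy(user_buf, aligned, size);       // copy any reply back to the caller

    free(aligned);
    return 0;
}

int main(void)
{
    char reply[64] = "probe";
    printf("%d %s\n", control_msg(reply, sizeof(reply)), reply);
    return 0;
}
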
*/ + aligned_buf = get_free_page(GFP_KERNEL); + if ( aligned_buf == NULL ) BUG(); + + xlblk_control_msg_pending = 1; + spin_lock_irqsave(&io_request_lock, flags); + /* Note that size gets rounded up to a sector-sized boundary. */ + if ( hypervisor_request(0, operation, aligned_buf, 0, (size+511)/512, 0) ) return -EAGAIN; - HYPERVISOR_block_io_op(); - while ( xlblk_control_msg_pending ) barrier(); + signal_requests_to_xen(); + spin_unlock_irqrestore(&io_request_lock, flags); + while ( xlblk_control_msg_pending ) barrier(); + + memcpy(buffer, aligned_buf, size); + free_page(aligned_buf); + return 0; } @@ -465,7 +516,7 @@ int __init xlblk_init(void) /* This mapping was created early at boot time. */ blk_ring = (blk_ring_t *)fix_to_virt(FIX_BLKRING_BASE); - blk_ring->req_prod = blk_ring->resp_prod = resp_cons = 0; + blk_ring->req_prod = blk_ring->resp_prod = resp_cons = req_prod = 0; error = request_irq(XLBLK_RESPONSE_IRQ, xlblk_response_int, 0, "xlblk-response", NULL); @@ -478,7 +529,8 @@ int __init xlblk_init(void) /* Probe for disk information. */ memset(&xlblk_disk_info, 0, sizeof(xlblk_disk_info)); error = xenolinux_control_msg(XEN_BLOCK_PROBE_BLK, - (char *)&xlblk_disk_info); + (char *)&xlblk_disk_info, + sizeof(xen_disk_info_t)); if ( error ) { printk(KERN_ALERT "Could not probe disks (%d)\n", error); diff --git a/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/block/xl_block.h b/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/block/xl_block.h index 17ca09d9f2..42da335c4a 100644 --- a/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/block/xl_block.h +++ b/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/block/xl_block.h @@ -46,7 +46,7 @@ typedef struct xl_disk { } xl_disk_t; /* Generic layer. */ -extern int xenolinux_control_msg(int operration, char *buffer); +extern int xenolinux_control_msg(int operration, char *buffer, int size); extern int xenolinux_block_open(struct inode *inode, struct file *filep); extern int xenolinux_block_release(struct inode *inode, struct file *filep); extern int xenolinux_block_ioctl(struct inode *inode, struct file *filep, diff --git a/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/block/xl_segment.c b/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/block/xl_segment.c index da2086e22c..97a53fe6fd 100644 --- a/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/block/xl_segment.c +++ b/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/block/xl_segment.c @@ -51,7 +51,7 @@ int __init xlseg_init(void) /* Probe for disk information. */ memset(xdi, 0, sizeof(*xdi)); - xenolinux_control_msg(XEN_BLOCK_PROBE_SEG, (char *)xdi); + xenolinux_control_msg(XEN_BLOCK_PROBE_SEG, (char *)xdi, sizeof(*xdi)); DPRINTK("vhd block device probe:\n"); for ( i = 0; i < xdi->count; i++ ) diff --git a/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/block/xl_segment_proc.c b/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/block/xl_segment_proc.c index e7c121b683..3149be747b 100644 --- a/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/block/xl_segment_proc.c +++ b/xenolinux-2.4.21-pre4-sparse/arch/xeno/drivers/block/xl_segment_proc.c @@ -210,7 +210,7 @@ static int proc_write_vhd(struct file *file, const char *buffer, xvd.extents[loop].size = to_number(string); } - xenolinux_control_msg(XEN_BLOCK_SEG_CREATE, (char *)&xvd); + xenolinux_control_msg(XEN_BLOCK_SEG_CREATE, (char *)&xvd, sizeof(xvd)); return count; } -- cgit v1.2.3