Diffstat (limited to 'linux-2.4.26-xen-sparse/arch/xen/drivers')
36 files changed, 8884 insertions, 0 deletions
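A note on the user-space interface to the balloon driver below: balloon.c keeps its ABI in one block marked "USER DEFINES -- THESE SHOULD BE COPIED TO USER-SPACE TOOLS". A minimal sketch of such a tool follows. The constants and the struct are copied from the driver; the /proc/xen/balloon path is an assumption based on create_xen_proc_entry("balloon", 0600), and the tool name balloon_ctl is hypothetical.

    /* balloon_ctl.c - hypothetical user-space companion to balloon.c below. */
    #include <fcntl.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <unistd.h>

    /* Copied from balloon.c, as its comment instructs. */
    #define USER_INFLATE_BALLOON 1 /* return mem to hypervisor */
    #define USER_DEFLATE_BALLOON 2 /* claim mem from hypervisor */
    typedef struct user_balloon_op {
        unsigned int op;
        unsigned long size;
    } user_balloon_op_t;

    int main(int argc, char **argv)
    {
        user_balloon_op_t bop;
        int fd;

        if ( argc != 3 )
        {
            fprintf(stderr, "usage: %s {inflate|deflate} <pages>\n", argv[0]);
            return 1;
        }

        bop.op   = (strcmp(argv[1], "inflate") == 0) ? USER_INFLATE_BALLOON
                                                     : USER_DEFLATE_BALLOON;
        bop.size = strtoul(argv[2], NULL, 0);

        /* Path assumed: xen_proc entries conventionally live under /proc/xen. */
        if ( (fd = open("/proc/xen/balloon", O_WRONLY)) < 0 )
        {
            perror("open");
            return 1;
        }

        /* balloon_write() consumes exactly sizeof(bop) and returns it on success. */
        if ( write(fd, &bop, sizeof(bop)) != sizeof(bop) )
        {
            perror("write");
            close(fd);
            return 1;
        }

        close(fd);
        return 0;
    }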
diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/balloon/Makefile b/linux-2.4.26-xen-sparse/arch/xen/drivers/balloon/Makefile
new file mode 100644
index 0000000000..9fb2227978
--- /dev/null
+++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/balloon/Makefile
@@ -0,0 +1,3 @@
+O_TARGET := drv.o
+obj-y := balloon.o
+include $(TOPDIR)/Rules.make
diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/balloon/balloon.c b/linux-2.4.26-xen-sparse/arch/xen/drivers/balloon/balloon.c
new file mode 100644
index 0000000000..56237380f3
--- /dev/null
+++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/balloon/balloon.c
@@ -0,0 +1,274 @@
+/******************************************************************************
+ * balloon.c
+ *
+ * Xen balloon driver - enables returning/claiming memory to/from Xen.
+ *
+ * Copyright (c) 2003, B Dragovic
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/errno.h>
+#include <asm/xen_proc.h>
+
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/smp_lock.h>
+#include <linux/pagemap.h>
+
+#include <asm/hypervisor.h>
+#include <asm/pgalloc.h>
+#include <asm/pgtable.h>
+#include <asm/uaccess.h>
+#include <asm/tlb.h>
+
+/* USER DEFINES -- THESE SHOULD BE COPIED TO USER-SPACE TOOLS */
+#define USER_INFLATE_BALLOON  1   /* return mem to hypervisor */
+#define USER_DEFLATE_BALLOON  2   /* claim mem from hypervisor */
+typedef struct user_balloon_op {
+    unsigned int  op;
+    unsigned long size;
+} user_balloon_op_t;
+/* END OF USER DEFINES */
+
+/* Dead entry written into balloon-owned entries in the PMT. */
+#define DEAD 0xdeadbeef
+
+static struct proc_dir_entry *balloon_pde;
+unsigned long credit;
+
+static inline pte_t *get_ptep(unsigned long addr)
+{
+    pgd_t *pgd; pmd_t *pmd; pte_t *ptep;
+    pgd = pgd_offset_k(addr);
+
+    if ( pgd_none(*pgd) || pgd_bad(*pgd) ) BUG();
+
+    pmd = pmd_offset(pgd, addr);
+    if ( pmd_none(*pmd) || pmd_bad(*pmd) ) BUG();
+
+    ptep = pte_offset(pmd, addr);
+
+    return ptep;
+}
+
+/* Main function for relinquishing a chunk of memory. */
+static unsigned long inflate_balloon(unsigned long num_pages)
+{
+    unsigned long *parray;
+    unsigned long *currp;
+    unsigned long curraddr;
+    unsigned long ret = 0;
+    unsigned long vaddr;
+    unsigned long i, j;
+
+    parray = (unsigned long *)kmalloc(num_pages * sizeof(unsigned long),
+                                      GFP_KERNEL);
+    currp = parray;
+
+    for ( i = 0; i < num_pages; i++ )
+    {
+        /* Try to obtain a free page (has to be done with GFP_ATOMIC). */
+        vaddr = __get_free_page(GFP_ATOMIC);
+
+        /* If allocation fails then free all reserved pages. */
+        if ( vaddr == 0 )
+        {
+            printk("Unable to inflate balloon by %ld, only %ld pages free.\n",
+                   num_pages, i);
+            currp = parray;
+            for ( j = 0; j < i; j++ ) {
+                free_page(*currp++);
+            }
+            goto cleanup;
+        }
+
+        *currp++ = vaddr;
+    }
+
+
+    currp = parray;
+    for ( i = 0; i < num_pages; i++ )
+    {
+        curraddr = *currp;
+        *currp = virt_to_machine(*currp) >> PAGE_SHIFT;
+        queue_l1_entry_update(get_ptep(curraddr), 0);
+        phys_to_machine_mapping[__pa(curraddr) >> PAGE_SHIFT] = DEAD;
+        currp++;
+    }
+
+    XEN_flush_page_update_queue();
+
+    ret = HYPERVISOR_dom_mem_op(MEMOP_decrease_reservation,
+                                parray, num_pages);
+    if ( unlikely(ret != num_pages) )
+    {
+        printk("Unable to inflate balloon, error %lx\n", ret);
+        goto cleanup;
+    }
+
+    credit += num_pages;
+    ret = num_pages;
+
+ cleanup:
+    kfree(parray);
+
+    return ret;
+}
+
+/* Install new mem pages obtained by deflate_balloon. This function walks
+ * the phys->machine mapping table looking for DEAD entries and populates
+ * them.
+ */
+static unsigned long process_new_pages(unsigned long * parray,
+                                       unsigned long num)
+{
+    /* Currently, this function is rather simplistic, as
+     * it is assumed that the domain reclaims only the
+     * number of pages previously released. This is to change
+     * soon, and the code to extend page tables etc. will be
+     * incorporated here.
+     */
+
+    unsigned long tot_pages = start_info.nr_pages;
+    unsigned long * curr = parray;
+    unsigned long num_installed;
+    unsigned long i;
+
+    num_installed = 0;
+    for ( i = 0; (i < tot_pages) && (num_installed < num); i++ )
+    {
+        if ( phys_to_machine_mapping[i] == DEAD )
+        {
+            phys_to_machine_mapping[i] = *curr;
+            queue_l1_entry_update(
+                (pte_t *)((i << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE), i);
+            queue_l1_entry_update(
+                get_ptep((unsigned long)__va(i << PAGE_SHIFT)),
+                ((*curr) << PAGE_SHIFT) | pgprot_val(PAGE_KERNEL));
+
+            *curr = (unsigned long)__va(i << PAGE_SHIFT);
+            curr++;
+            num_installed++;
+        }
+    }
+
+    /* Now, this is tricky (and will also change for machine addrs that
+     * are mapped to not-previously-released addresses): we free the pages
+     * that were allocated by __get_free_page() (their mappings are
+     * different now, of course).
+     */
+    curr = parray;
+    for ( i = 0; i < num_installed; i++ )
+    {
+        free_page(*curr);
+        curr++;
+    }
+
+    return num_installed;
+}
+
+unsigned long deflate_balloon(unsigned long num_pages)
+{
+    unsigned long ret;
+    unsigned long * parray;
+
+    if ( num_pages > credit )
+    {
+        printk("Cannot allocate more pages than previously released.\n");
+        return -EAGAIN;
+    }
+
+    parray = (unsigned long *)kmalloc(num_pages * sizeof(unsigned long),
+                                      GFP_KERNEL);
+
+    ret = HYPERVISOR_dom_mem_op(MEMOP_increase_reservation,
+                                parray, num_pages);
+    if ( unlikely(ret != num_pages) )
+    {
+        printk("Unable to deflate balloon, error %lx\n", ret);
+        goto cleanup;
+    }
+
+    if ( (ret = process_new_pages(parray, num_pages)) < num_pages )
+    {
+        printk("Unable to deflate balloon by specified %lx pages, only %lx.\n",
+               num_pages, ret);
+        goto cleanup;
+    }
+
+    ret = num_pages;
+    credit -= num_pages;
+
+ cleanup:
+    kfree(parray);
+
+    return ret;
+}
+
+static int balloon_write(struct file *file, const char *buffer,
+                         u_long count, void *data)
+{
+    user_balloon_op_t bop;
+
+    /* Only admin can play with the balloon :) */
+    if ( !capable(CAP_SYS_ADMIN) )
+        return -EPERM;
+
+    if ( copy_from_user(&bop, buffer, sizeof(bop)) )
+        return -EFAULT;
+
+    switch ( bop.op )
+    {
+    case USER_INFLATE_BALLOON:
+        if ( inflate_balloon(bop.size) < bop.size )
+            return -EAGAIN;
+        break;
+
+    case USER_DEFLATE_BALLOON:
+        deflate_balloon(bop.size);
+        break;
+
+    default:
+        printk("Unknown command to balloon driver.\n");
+        return -EFAULT;
+    }
+
+    return sizeof(bop);
+}
+
+/*
+ * Main balloon driver initialization function.
+ */ +static int __init init_module(void) +{ + printk(KERN_ALERT "Starting Xen Balloon driver\n"); + + credit = 0; + + if ( (balloon_pde = create_xen_proc_entry("balloon", 0600)) == NULL ) + { + printk(KERN_ALERT "Unable to create balloon driver proc entry!"); + return -1; + } + + balloon_pde->write_proc = balloon_write; + + return 0; +} + +static void __exit cleanup_module(void) +{ + if ( balloon_pde != NULL ) + { + remove_xen_proc_entry("balloon"); + balloon_pde = NULL; + } +} + +module_init(init_module); +module_exit(cleanup_module); + + diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/Makefile b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/Makefile new file mode 100644 index 0000000000..20c8192d3d --- /dev/null +++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/Makefile @@ -0,0 +1,10 @@ + +O_TARGET := drv.o + +subdir-y += frontend +obj-y += frontend/drv.o + +subdir-$(CONFIG_XEN_PHYSDEV_ACCESS) += backend +obj-$(CONFIG_XEN_PHYSDEV_ACCESS) += backend/drv.o + +include $(TOPDIR)/Rules.make diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/Makefile b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/Makefile new file mode 100644 index 0000000000..4c8c17367c --- /dev/null +++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/Makefile @@ -0,0 +1,3 @@ +O_TARGET := drv.o +obj-y := main.o control.o interface.o vbd.o +include $(TOPDIR)/Rules.make diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/common.h b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/common.h new file mode 100644 index 0000000000..d9f1d22908 --- /dev/null +++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/common.h @@ -0,0 +1,108 @@ +/****************************************************************************** + * arch/xen/drivers/blkif/backend/common.h + */ + +#ifndef __BLKIF__BACKEND__COMMON_H__ +#define __BLKIF__BACKEND__COMMON_H__ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/rbtree.h> +#include <linux/interrupt.h> +#include <linux/slab.h> +#include <linux/blkdev.h> +#include <asm/ctrl_if.h> +#include <asm/io.h> +#include "../blkif.h" + +#ifndef NDEBUG +#define ASSERT(_p) \ + if ( !(_p) ) { printk("Assertion '%s' failed, line %d, file %s", #_p , \ + __LINE__, __FILE__); *(int*)0=0; } +#define DPRINTK(_f, _a...) printk("(file=%s, line=%d) " _f, \ + __FILE__ , __LINE__ , ## _a ) +#else +#define ASSERT(_p) ((void)0) +#define DPRINTK(_f, _a...) ((void)0) +#endif + +typedef struct blkif_st { + /* Unique identifier for this interface. */ + domid_t domid; + unsigned int handle; + /* Physical parameters of the comms window. */ + unsigned long shmem_frame; + unsigned int evtchn; + int irq; + /* Comms information. */ + blkif_ring_t *blk_ring_base; /* ioremap()'ed ptr to shmem_frame. */ + BLK_RING_IDX blk_req_cons; /* Request consumer. */ + BLK_RING_IDX blk_resp_prod; /* Private version of response producer. */ + /* VBDs attached to this interface. */ + rb_root_t vbd_rb; /* Mapping from 16-bit vdevices to VBDs. */ + spinlock_t vbd_lock; /* Protects VBD mapping. */ + /* Private fields. */ + enum { DISCONNECTED, DISCONNECTING, CONNECTED } status; + /* + * DISCONNECT response is deferred until pending requests are ack'ed. + * We therefore need to store the id from the original request. 
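+     * It is kept in disconnect_rspid below and echoed back by
+     * __blkif_disconnect_complete() once the last reference is dropped.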
+ */ + u8 disconnect_rspid; + struct blkif_st *hash_next; + struct list_head blkdev_list; + spinlock_t blk_ring_lock; + atomic_t refcnt; +} blkif_t; + +void blkif_create(blkif_be_create_t *create); +void blkif_destroy(blkif_be_destroy_t *destroy); +void blkif_connect(blkif_be_connect_t *connect); +int blkif_disconnect(blkif_be_disconnect_t *disconnect, u8 rsp_id); +void __blkif_disconnect_complete(blkif_t *blkif); +blkif_t *blkif_find_by_handle(domid_t domid, unsigned int handle); +#define blkif_get(_b) (atomic_inc(&(_b)->refcnt)) +#define blkif_put(_b) \ + do { \ + if ( atomic_dec_and_test(&(_b)->refcnt) ) \ + __blkif_disconnect_complete(_b); \ + } while (0) + +/* An entry in a list of xen_extents. */ +typedef struct _blkif_extent_le { + blkif_extent_t extent; /* an individual extent */ + struct _blkif_extent_le *next; /* and a pointer to the next */ +} blkif_extent_le_t; + +typedef struct _vbd { + blkif_vdev_t vdevice; /* what the domain refers to this vbd as */ + unsigned char readonly; /* Non-zero -> read-only */ + unsigned char type; /* XD_TYPE_xxx */ + blkif_extent_le_t *extents; /* list of xen_extents making up this vbd */ + rb_node_t rb; /* for linking into R-B tree lookup struct */ +} vbd_t; + +void vbd_create(blkif_be_vbd_create_t *create); +void vbd_grow(blkif_be_vbd_grow_t *grow); +void vbd_shrink(blkif_be_vbd_shrink_t *shrink); +void vbd_destroy(blkif_be_vbd_destroy_t *delete); +int vbd_probe(blkif_t *blkif, vdisk_t *vbd_info, int max_vbds); +void destroy_all_vbds(blkif_t *blkif); + +/* Describes a [partial] disk extent (part of a block io request) */ +typedef struct { + unsigned short dev; + unsigned short nr_sects; + unsigned long buffer; + xen_sector_t sector_number; +} phys_seg_t; + +int vbd_translate(phys_seg_t *pseg, blkif_t *blkif, int operation); + +void blkif_interface_init(void); +void blkif_ctrlif_init(void); + +void blkif_deschedule(blkif_t *blkif); + +void blkif_be_int(int irq, void *dev_id, struct pt_regs *regs); + +#endif /* __BLKIF__BACKEND__COMMON_H__ */ diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/control.c b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/control.c new file mode 100644 index 0000000000..0b26224651 --- /dev/null +++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/control.c @@ -0,0 +1,87 @@ +/****************************************************************************** + * arch/xen/drivers/blkif/backend/control.c + * + * Routines for interfacing with the control plane. + * + * Copyright (c) 2004, Keir Fraser + */ + +#include "common.h" + +static void blkif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id) +{ + DPRINTK("Received blkif backend message, subtype=%d\n", msg->subtype); + + switch ( msg->subtype ) + { + case CMSG_BLKIF_BE_CREATE: + if ( msg->length != sizeof(blkif_be_create_t) ) + goto parse_error; + blkif_create((blkif_be_create_t *)&msg->msg[0]); + break; + case CMSG_BLKIF_BE_DESTROY: + if ( msg->length != sizeof(blkif_be_destroy_t) ) + goto parse_error; + blkif_destroy((blkif_be_destroy_t *)&msg->msg[0]); + break; + case CMSG_BLKIF_BE_CONNECT: + if ( msg->length != sizeof(blkif_be_connect_t) ) + goto parse_error; + blkif_connect((blkif_be_connect_t *)&msg->msg[0]); + break; + case CMSG_BLKIF_BE_DISCONNECT: + if ( msg->length != sizeof(blkif_be_disconnect_t) ) + goto parse_error; + if ( !blkif_disconnect((blkif_be_disconnect_t *)&msg->msg[0],msg->id) ) + return; /* Sending the response is deferred until later. 
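+                       It is sent later by __blkif_disconnect_complete(),
+                       once the interface's reference count drops to zero.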
*/ + break; + case CMSG_BLKIF_BE_VBD_CREATE: + if ( msg->length != sizeof(blkif_be_vbd_create_t) ) + goto parse_error; + vbd_create((blkif_be_vbd_create_t *)&msg->msg[0]); + break; + case CMSG_BLKIF_BE_VBD_DESTROY: + if ( msg->length != sizeof(blkif_be_vbd_destroy_t) ) + goto parse_error; + vbd_destroy((blkif_be_vbd_destroy_t *)&msg->msg[0]); + break; + case CMSG_BLKIF_BE_VBD_GROW: + if ( msg->length != sizeof(blkif_be_vbd_grow_t) ) + goto parse_error; + vbd_grow((blkif_be_vbd_grow_t *)&msg->msg[0]); + break; + case CMSG_BLKIF_BE_VBD_SHRINK: + if ( msg->length != sizeof(blkif_be_vbd_shrink_t) ) + goto parse_error; + vbd_shrink((blkif_be_vbd_shrink_t *)&msg->msg[0]); + break; + default: + goto parse_error; + } + + ctrl_if_send_response(msg); + return; + + parse_error: + DPRINTK("Parse error while reading message subtype %d, len %d\n", + msg->subtype, msg->length); + msg->length = 0; + ctrl_if_send_response(msg); +} + +void blkif_ctrlif_init(void) +{ + ctrl_msg_t cmsg; + blkif_be_driver_status_changed_t st; + + (void)ctrl_if_register_receiver(CMSG_BLKIF_BE, blkif_ctrlif_rx, + CALLBACK_IN_BLOCKING_CONTEXT); + + /* Send a driver-UP notification to the domain controller. */ + cmsg.type = CMSG_BLKIF_BE; + cmsg.subtype = CMSG_BLKIF_BE_DRIVER_STATUS_CHANGED; + cmsg.length = sizeof(blkif_be_driver_status_changed_t); + st.status = BLKIF_DRIVER_STATUS_UP; + memcpy(cmsg.msg, &st, sizeof(st)); + ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE); +} diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/interface.c b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/interface.c new file mode 100644 index 0000000000..780d793c6c --- /dev/null +++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/interface.c @@ -0,0 +1,233 @@ +/****************************************************************************** + * arch/xen/drivers/blkif/backend/interface.c + * + * Block-device interface management. + * + * Copyright (c) 2004, Keir Fraser + */ + +#include "common.h" + +#define BLKIF_HASHSZ 1024 +#define BLKIF_HASH(_d,_h) (((int)(_d)^(int)(_h))&(BLKIF_HASHSZ-1)) + +static kmem_cache_t *blkif_cachep; +static blkif_t *blkif_hash[BLKIF_HASHSZ]; + +blkif_t *blkif_find_by_handle(domid_t domid, unsigned int handle) +{ + blkif_t *blkif = blkif_hash[BLKIF_HASH(domid, handle)]; + while ( (blkif != NULL) && + ((blkif->domid != domid) || (blkif->handle != handle)) ) + blkif = blkif->hash_next; + return blkif; +} + +void __blkif_disconnect_complete(blkif_t *blkif) +{ + ctrl_msg_t cmsg; + blkif_be_disconnect_t disc; + + /* + * These can't be done in __blkif_disconnect() because at that point there + * may be outstanding requests at the disc whose asynchronous responses + * must still be notified to the remote driver. + */ + unbind_evtchn_from_irq(blkif->evtchn); + vfree(blkif->blk_ring_base); + + /* Construct the deferred response message. */ + cmsg.type = CMSG_BLKIF_BE; + cmsg.subtype = CMSG_BLKIF_BE_DISCONNECT; + cmsg.id = blkif->disconnect_rspid; + cmsg.length = sizeof(blkif_be_disconnect_t); + disc.domid = blkif->domid; + disc.blkif_handle = blkif->handle; + disc.status = BLKIF_BE_STATUS_OKAY; + memcpy(cmsg.msg, &disc, sizeof(disc)); + + /* + * Make sure message is constructed /before/ status change, because + * after the status change the 'blkif' structure could be deallocated at + * any time. Also make sure we send the response /after/ status change, + * as otherwise a subsequent CONNECT request could spuriously fail if + * another CPU doesn't see the status change yet. 
+ */ + mb(); + if ( blkif->status != DISCONNECTING ) + BUG(); + blkif->status = DISCONNECTED; + mb(); + + /* Send the successful response. */ + ctrl_if_send_response(&cmsg); +} + +void blkif_create(blkif_be_create_t *create) +{ + domid_t domid = create->domid; + unsigned int handle = create->blkif_handle; + blkif_t **pblkif, *blkif; + + if ( (blkif = kmem_cache_alloc(blkif_cachep, GFP_KERNEL)) == NULL ) + { + DPRINTK("Could not create blkif: out of memory\n"); + create->status = BLKIF_BE_STATUS_OUT_OF_MEMORY; + return; + } + + memset(blkif, 0, sizeof(*blkif)); + blkif->domid = domid; + blkif->handle = handle; + blkif->status = DISCONNECTED; + spin_lock_init(&blkif->vbd_lock); + spin_lock_init(&blkif->blk_ring_lock); + atomic_set(&blkif->refcnt, 0); + + pblkif = &blkif_hash[BLKIF_HASH(domid, handle)]; + while ( *pblkif != NULL ) + { + if ( ((*pblkif)->domid == domid) && ((*pblkif)->handle == handle) ) + { + DPRINTK("Could not create blkif: already exists\n"); + create->status = BLKIF_BE_STATUS_INTERFACE_EXISTS; + kmem_cache_free(blkif_cachep, blkif); + return; + } + pblkif = &(*pblkif)->hash_next; + } + + blkif->hash_next = *pblkif; + *pblkif = blkif; + + DPRINTK("Successfully created blkif\n"); + create->status = BLKIF_BE_STATUS_OKAY; +} + +void blkif_destroy(blkif_be_destroy_t *destroy) +{ + domid_t domid = destroy->domid; + unsigned int handle = destroy->blkif_handle; + blkif_t **pblkif, *blkif; + + pblkif = &blkif_hash[BLKIF_HASH(domid, handle)]; + while ( (blkif = *pblkif) != NULL ) + { + if ( (blkif->domid == domid) && (blkif->handle == handle) ) + { + if ( blkif->status != DISCONNECTED ) + goto still_connected; + goto destroy; + } + pblkif = &blkif->hash_next; + } + + destroy->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND; + return; + + still_connected: + destroy->status = BLKIF_BE_STATUS_INTERFACE_CONNECTED; + return; + + destroy: + *pblkif = blkif->hash_next; + destroy_all_vbds(blkif); + kmem_cache_free(blkif_cachep, blkif); + destroy->status = BLKIF_BE_STATUS_OKAY; +} + +void blkif_connect(blkif_be_connect_t *connect) +{ + domid_t domid = connect->domid; + unsigned int handle = connect->blkif_handle; + unsigned int evtchn = connect->evtchn; + unsigned long shmem_frame = connect->shmem_frame; + struct vm_struct *vma; + pgprot_t prot; + int error; + blkif_t *blkif; + + blkif = blkif_find_by_handle(domid, handle); + if ( unlikely(blkif == NULL) ) + { + DPRINTK("blkif_connect attempted for non-existent blkif (%u,%u)\n", + connect->domid, connect->blkif_handle); + connect->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND; + return; + } + + if ( (vma = get_vm_area(PAGE_SIZE, VM_IOREMAP)) == NULL ) + { + connect->status = BLKIF_BE_STATUS_OUT_OF_MEMORY; + return; + } + + prot = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED); + error = direct_remap_area_pages(&init_mm, VMALLOC_VMADDR(vma->addr), + shmem_frame<<PAGE_SHIFT, PAGE_SIZE, + prot, domid); + if ( error != 0 ) + { + if ( error == -ENOMEM ) + connect->status = BLKIF_BE_STATUS_OUT_OF_MEMORY; + else if ( error == -EFAULT ) + connect->status = BLKIF_BE_STATUS_MAPPING_ERROR; + else + connect->status = BLKIF_BE_STATUS_ERROR; + vfree(vma->addr); + return; + } + + if ( blkif->status != DISCONNECTED ) + { + connect->status = BLKIF_BE_STATUS_INTERFACE_CONNECTED; + vfree(vma->addr); + return; + } + + blkif->evtchn = evtchn; + blkif->irq = bind_evtchn_to_irq(evtchn); + blkif->shmem_frame = shmem_frame; + blkif->blk_ring_base = (blkif_ring_t *)vma->addr; + blkif->status = CONNECTED; + blkif_get(blkif); + + request_irq(blkif->irq, 
blkif_be_int, 0, "blkif-backend", blkif); + + connect->status = BLKIF_BE_STATUS_OKAY; +} + +int blkif_disconnect(blkif_be_disconnect_t *disconnect, u8 rsp_id) +{ + domid_t domid = disconnect->domid; + unsigned int handle = disconnect->blkif_handle; + blkif_t *blkif; + + blkif = blkif_find_by_handle(domid, handle); + if ( unlikely(blkif == NULL) ) + { + DPRINTK("blkif_disconnect attempted for non-existent blkif" + " (%u,%u)\n", disconnect->domid, disconnect->blkif_handle); + disconnect->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND; + return 1; /* Caller will send response error message. */ + } + + if ( blkif->status == CONNECTED ) + { + blkif->status = DISCONNECTING; + blkif->disconnect_rspid = rsp_id; + wmb(); /* Let other CPUs see the status change. */ + free_irq(blkif->irq, NULL); + blkif_deschedule(blkif); + blkif_put(blkif); + } + + return 0; /* Caller should not send response message. */ +} + +void __init blkif_interface_init(void) +{ + blkif_cachep = kmem_cache_create("blkif_cache", sizeof(blkif_t), + 0, 0, NULL, NULL); + memset(blkif_hash, 0, sizeof(blkif_hash)); +} diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/main.c b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/main.c new file mode 100644 index 0000000000..803af976d2 --- /dev/null +++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/main.c @@ -0,0 +1,523 @@ +/****************************************************************************** + * arch/xen/drivers/blkif/backend/main.c + * + * Back-end of the driver for virtual block devices. This portion of the + * driver exports a 'unified' block-device interface that can be accessed + * by any operating system that implements a compatible front end. A + * reference front-end implementation can be found in: + * arch/xen/drivers/blkif/frontend + * + * Copyright (c) 2003-2004, Keir Fraser & Steve Hand + */ + +#include "common.h" + +/* + * These are rather arbitrary. They are fairly large because adjacent requests + * pulled from a communication ring are quite likely to end up being part of + * the same scatter/gather request at the disc. + * + * ** TRY INCREASING 'MAX_PENDING_REQS' IF WRITE SPEEDS SEEM TOO LOW ** + * This will increase the chances of being able to write whole tracks. + * 64 should be enough to keep us competitive with Linux. + */ +#define MAX_PENDING_REQS 64 +#define BATCH_PER_DOMAIN 16 + +/* + * NB. We place a page of padding between each buffer page to avoid incorrect + * merging of requests by the IDE and SCSI merging routines. Otherwise, two + * adjacent buffers in a scatter-gather request would have adjacent page + * numbers: since the merge routines don't realise that this is in *pseudophys* + * space, not real space, they may collapse the s-g elements! + */ +static unsigned long mmap_vstart; +#define MMAP_PAGES_PER_REQUEST \ + (2 * (BLKIF_MAX_SEGMENTS_PER_REQUEST + 1)) +#define MMAP_PAGES \ + (MAX_PENDING_REQS * MMAP_PAGES_PER_REQUEST) +#define MMAP_VADDR(_req,_seg) \ + (mmap_vstart + \ + ((_req) * MMAP_PAGES_PER_REQUEST * PAGE_SIZE) + \ + ((_seg) * 2 * PAGE_SIZE)) + +/* + * Each outstanding request that we've passed to the lower device layers has a + * 'pending_req' allocated to it. Each buffer_head that completes decrements + * the pendcnt towards zero. When it hits zero, the specified domain has a + * response queued for it, with the saved 'id' passed back. 
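+ * For example, a three-segment request starts with pendcnt == 3; the third
+ * end_block_io_op() completion queues the single response for the domain.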
+ */ +typedef struct { + blkif_t *blkif; + unsigned long id; + int nr_pages; + atomic_t pendcnt; + unsigned short operation; + int status; +} pending_req_t; + +/* + * We can't allocate pending_req's in order, since they may complete out of + * order. We therefore maintain an allocation ring. This ring also indicates + * when enough work has been passed down -- at that point the allocation ring + * will be empty. + */ +static pending_req_t pending_reqs[MAX_PENDING_REQS]; +static unsigned char pending_ring[MAX_PENDING_REQS]; +static spinlock_t pend_prod_lock = SPIN_LOCK_UNLOCKED; +/* NB. We use a different index type to differentiate from shared blk rings. */ +typedef unsigned int PEND_RING_IDX; +#define MASK_PEND_IDX(_i) ((_i)&(MAX_PENDING_REQS-1)) +static PEND_RING_IDX pending_prod, pending_cons; +#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons) + +static kmem_cache_t *buffer_head_cachep; + +static int do_block_io_op(blkif_t *blkif, int max_to_do); +static void dispatch_probe(blkif_t *blkif, blkif_request_t *req); +static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req); +static void make_response(blkif_t *blkif, unsigned long id, + unsigned short op, int st); + +static void fast_flush_area(int idx, int nr_pages) +{ + multicall_entry_t mcl[MMAP_PAGES_PER_REQUEST]; + int i; + + for ( i = 0; i < nr_pages; i++ ) + { + mcl[i].op = __HYPERVISOR_update_va_mapping; + mcl[i].args[0] = MMAP_VADDR(idx, i) >> PAGE_SHIFT; + mcl[i].args[1] = 0; + mcl[i].args[2] = 0; + } + + mcl[nr_pages-1].args[2] = UVMF_FLUSH_TLB; + (void)HYPERVISOR_multicall(mcl, nr_pages); +} + + +/****************************************************************** + * BLOCK-DEVICE SCHEDULER LIST MAINTENANCE + */ + +static struct list_head io_schedule_list; +static spinlock_t io_schedule_list_lock; + +static int __on_blkdev_list(blkif_t *blkif) +{ + return blkif->blkdev_list.next != NULL; +} + +static void remove_from_blkdev_list(blkif_t *blkif) +{ + unsigned long flags; + if ( !__on_blkdev_list(blkif) ) return; + spin_lock_irqsave(&io_schedule_list_lock, flags); + if ( __on_blkdev_list(blkif) ) + { + list_del(&blkif->blkdev_list); + blkif->blkdev_list.next = NULL; + blkif_put(blkif); + } + spin_unlock_irqrestore(&io_schedule_list_lock, flags); +} + +static void add_to_blkdev_list_tail(blkif_t *blkif) +{ + unsigned long flags; + if ( __on_blkdev_list(blkif) ) return; + spin_lock_irqsave(&io_schedule_list_lock, flags); + if ( !__on_blkdev_list(blkif) && (blkif->status == CONNECTED) ) + { + list_add_tail(&blkif->blkdev_list, &io_schedule_list); + blkif_get(blkif); + } + spin_unlock_irqrestore(&io_schedule_list_lock, flags); +} + + +/****************************************************************** + * SCHEDULER FUNCTIONS + */ + +static void io_schedule(unsigned long unused) +{ + blkif_t *blkif; + struct list_head *ent; + + /* Queue up a batch of requests. */ + while ( (NR_PENDING_REQS < MAX_PENDING_REQS) && + !list_empty(&io_schedule_list) ) + { + ent = io_schedule_list.next; + blkif = list_entry(ent, blkif_t, blkdev_list); + blkif_get(blkif); + remove_from_blkdev_list(blkif); + if ( do_block_io_op(blkif, BATCH_PER_DOMAIN) ) + add_to_blkdev_list_tail(blkif); + blkif_put(blkif); + } + + /* Push the batch through to disc. 
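+     * (run_task_queue(&tq_disk) runs the queued unplug routines, so the
+     * low-level driver sees the whole batch at once.)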
*/ + run_task_queue(&tq_disk); +} + +static DECLARE_TASKLET(io_schedule_tasklet, io_schedule, 0); + +static void maybe_trigger_io_schedule(void) +{ + /* + * Needed so that two processes, who together make the following predicate + * true, don't both read stale values and evaluate the predicate + * incorrectly. Incredibly unlikely to stall the scheduler on x86, but... + */ + smp_mb(); + + if ( (NR_PENDING_REQS < (MAX_PENDING_REQS/2)) && + !list_empty(&io_schedule_list) ) + tasklet_schedule(&io_schedule_tasklet); +} + + + +/****************************************************************** + * COMPLETION CALLBACK -- Called as bh->b_end_io() + */ + +static void __end_block_io_op(pending_req_t *pending_req, int uptodate) +{ + unsigned long flags; + + /* An error fails the entire request. */ + if ( !uptodate ) + { + DPRINTK("Buffer not up-to-date at end of operation\n"); + pending_req->status = BLKIF_RSP_ERROR; + } + + if ( atomic_dec_and_test(&pending_req->pendcnt) ) + { + int pending_idx = pending_req - pending_reqs; + fast_flush_area(pending_idx, pending_req->nr_pages); + make_response(pending_req->blkif, pending_req->id, + pending_req->operation, pending_req->status); + blkif_put(pending_req->blkif); + spin_lock_irqsave(&pend_prod_lock, flags); + pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx; + spin_unlock_irqrestore(&pend_prod_lock, flags); + maybe_trigger_io_schedule(); + } +} + +static void end_block_io_op(struct buffer_head *bh, int uptodate) +{ + __end_block_io_op(bh->b_private, uptodate); + kmem_cache_free(buffer_head_cachep, bh); +} + + + +/****************************************************************************** + * NOTIFICATION FROM GUEST OS. + */ + +void blkif_be_int(int irq, void *dev_id, struct pt_regs *regs) +{ + blkif_t *blkif = dev_id; + add_to_blkdev_list_tail(blkif); + maybe_trigger_io_schedule(); +} + + + +/****************************************************************** + * DOWNWARD CALLS -- These interface with the block-device layer proper. + */ + +static int do_block_io_op(blkif_t *blkif, int max_to_do) +{ + blkif_ring_t *blk_ring = blkif->blk_ring_base; + blkif_request_t *req; + BLK_RING_IDX i; + int more_to_do = 0; + + /* Take items off the comms ring, taking care not to overflow. */ + for ( i = blkif->blk_req_cons; + (i != blk_ring->req_prod) && ((i-blkif->blk_resp_prod) != + BLK_RING_SIZE); + i++ ) + { + if ( (max_to_do-- == 0) || (NR_PENDING_REQS == MAX_PENDING_REQS) ) + { + more_to_do = 1; + break; + } + + req = &blk_ring->ring[MASK_BLK_IDX(i)].req; + switch ( req->operation ) + { + case BLKIF_OP_READ: + case BLKIF_OP_WRITE: + dispatch_rw_block_io(blkif, req); + break; + + case BLKIF_OP_PROBE: + dispatch_probe(blkif, req); + break; + + default: + DPRINTK("error: unknown block io operation [%d]\n", + blk_ring->ring[i].req.operation); + make_response(blkif, blk_ring->ring[i].req.id, + blk_ring->ring[i].req.operation, BLKIF_RSP_ERROR); + break; + } + } + + blkif->blk_req_cons = i; + return more_to_do; +} + +static void dispatch_probe(blkif_t *blkif, blkif_request_t *req) +{ + int rsp = BLKIF_RSP_ERROR; + int pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)]; + + /* We expect one buffer only. */ + if ( unlikely(req->nr_segments != 1) ) + goto out; + + /* Make sure the buffer is page-sized. 
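+     * (first_sect == 0 and last_sect == 7 together span all eight 512-byte
+     * sectors of the frame, i.e. one whole 4kB page.)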
*/ + if ( (blkif_first_sect(req->frame_and_sects[0]) != 0) || + (blkif_last_sect(req->frame_and_sects[0]) != 7) ) + goto out; + + if ( HYPERVISOR_update_va_mapping_otherdomain( + MMAP_VADDR(pending_idx, 0) >> PAGE_SHIFT, + (pte_t) { (req->frame_and_sects[0] & PAGE_MASK) | __PAGE_KERNEL }, + 0, blkif->domid) ) + goto out; + + rsp = vbd_probe(blkif, (vdisk_t *)MMAP_VADDR(pending_idx, 0), + PAGE_SIZE / sizeof(vdisk_t)); + + out: + fast_flush_area(pending_idx, 1); + make_response(blkif, req->id, req->operation, rsp); +} + +static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req) +{ + extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]); + struct buffer_head *bh; + int operation = (req->operation == BLKIF_OP_WRITE) ? WRITE : READ; + short nr_sects; + unsigned long buffer, fas; + int i, tot_sects, pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)]; + pending_req_t *pending_req; + unsigned long remap_prot; + multicall_entry_t mcl[MMAP_PAGES_PER_REQUEST]; + + /* We map virtual scatter/gather segments to physical segments. */ + int new_segs, nr_psegs = 0; + phys_seg_t phys_seg[BLKIF_MAX_SEGMENTS_PER_REQUEST + 1]; + + /* Check that number of segments is sane. */ + if ( unlikely(req->nr_segments == 0) || + unlikely(req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST) ) + { + DPRINTK("Bad number of segments in request (%d)\n", req->nr_segments); + goto bad_descriptor; + } + + /* + * Check each address/size pair is sane, and convert into a + * physical device and block offset. Note that if the offset and size + * crosses a virtual extent boundary, we may end up with more + * physical scatter/gather segments than virtual segments. + */ + for ( i = tot_sects = 0; i < req->nr_segments; i++, tot_sects += nr_sects ) + { + fas = req->frame_and_sects[i]; + buffer = (fas & PAGE_MASK) | (blkif_first_sect(fas) << 9); + nr_sects = blkif_last_sect(fas) - blkif_first_sect(fas) + 1; + + if ( nr_sects <= 0 ) + goto bad_descriptor; + + phys_seg[nr_psegs].dev = req->device; + phys_seg[nr_psegs].sector_number = req->sector_number + tot_sects; + phys_seg[nr_psegs].buffer = buffer; + phys_seg[nr_psegs].nr_sects = nr_sects; + + /* Translate the request into the relevant 'physical device' */ + new_segs = vbd_translate(&phys_seg[nr_psegs], blkif, operation); + if ( new_segs < 0 ) + { + DPRINTK("access denied: %s of [%llu,%llu] on dev=%04x\n", + operation == READ ? "read" : "write", + req->sector_number + tot_sects, + req->sector_number + tot_sects + nr_sects, + req->device); + goto bad_descriptor; + } + + nr_psegs += new_segs; + ASSERT(nr_psegs <= (BLKIF_MAX_SEGMENTS_PER_REQUEST+1)); + } + + /* Nonsensical zero-sized request? 
*/ + if ( unlikely(nr_psegs == 0) ) + goto bad_descriptor; + + if ( operation == READ ) + remap_prot = _PAGE_PRESENT|_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW; + else + remap_prot = _PAGE_PRESENT|_PAGE_DIRTY|_PAGE_ACCESSED; + + for ( i = 0; i < nr_psegs; i++ ) + { + mcl[i].op = __HYPERVISOR_update_va_mapping_otherdomain; + mcl[i].args[0] = MMAP_VADDR(pending_idx, i) >> PAGE_SHIFT; + mcl[i].args[1] = (phys_seg[i].buffer & PAGE_MASK) | remap_prot; + mcl[i].args[2] = 0; + mcl[i].args[3] = blkif->domid; + + phys_to_machine_mapping[__pa(MMAP_VADDR(pending_idx, i))>>PAGE_SHIFT] = + phys_seg[i].buffer >> PAGE_SHIFT; + } + + (void)HYPERVISOR_multicall(mcl, nr_psegs); + + for ( i = 0; i < nr_psegs; i++ ) + { + if ( unlikely(mcl[i].args[5] != 0) ) + { + DPRINTK("invalid buffer -- could not remap it\n"); + fast_flush_area(pending_idx, nr_psegs); + goto bad_descriptor; + } + } + + pending_req = &pending_reqs[pending_idx]; + pending_req->blkif = blkif; + pending_req->id = req->id; + pending_req->operation = operation; + pending_req->status = BLKIF_RSP_OKAY; + pending_req->nr_pages = nr_psegs; + atomic_set(&pending_req->pendcnt, nr_psegs); + pending_cons++; + + blkif_get(blkif); + + /* Now we pass each segment down to the real blkdev layer. */ + for ( i = 0; i < nr_psegs; i++ ) + { + bh = kmem_cache_alloc(buffer_head_cachep, GFP_ATOMIC); + if ( unlikely(bh == NULL) ) + { + __end_block_io_op(pending_req, 0); + continue; + } + memset(bh, 0, sizeof (struct buffer_head)); + + init_waitqueue_head(&bh->b_wait); + bh->b_size = phys_seg[i].nr_sects << 9; + bh->b_dev = phys_seg[i].dev; + bh->b_rdev = phys_seg[i].dev; + bh->b_rsector = (unsigned long)phys_seg[i].sector_number; + bh->b_data = (char *)MMAP_VADDR(pending_idx, i) + + (phys_seg[i].buffer & ~PAGE_MASK); + bh->b_page = virt_to_page(MMAP_VADDR(pending_idx, i)); + bh->b_end_io = end_block_io_op; + bh->b_private = pending_req; + + bh->b_state = (1 << BH_Mapped) | (1 << BH_Lock) | + (1 << BH_Req) | (1 << BH_Launder); + if ( operation == WRITE ) + bh->b_state |= (1 << BH_JBD) | (1 << BH_Req) | (1 << BH_Uptodate); + + atomic_set(&bh->b_count, 1); + + /* Dispatch a single request. We'll flush it to disc later. */ + generic_make_request(operation, bh); + } + + return; + + bad_descriptor: + make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR); +} + + + +/****************************************************************** + * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING + */ + + +static void make_response(blkif_t *blkif, unsigned long id, + unsigned short op, int st) +{ + blkif_response_t *resp; + unsigned long flags; + + /* Place on the response ring for the relevant domain. */ + spin_lock_irqsave(&blkif->blk_ring_lock, flags); + resp = &blkif->blk_ring_base-> + ring[MASK_BLK_IDX(blkif->blk_resp_prod)].resp; + resp->id = id; + resp->operation = op; + resp->status = st; + wmb(); + blkif->blk_ring_base->resp_prod = ++blkif->blk_resp_prod; + spin_unlock_irqrestore(&blkif->blk_ring_lock, flags); + + /* Kick the relevant domain. 
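+     * (The event-channel notification wakes the front end so that it can
+     * consume the new response.)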
*/ + notify_via_evtchn(blkif->evtchn); +} + +void blkif_deschedule(blkif_t *blkif) +{ + remove_from_blkdev_list(blkif); +} + +static int __init init_module(void) +{ + int i; + + if ( !(start_info.flags & SIF_INITDOMAIN) + && !(start_info.flags & SIF_BLK_BE_DOMAIN) ) + return 0; + + blkif_interface_init(); + + if ( (mmap_vstart = allocate_empty_lowmem_region(MMAP_PAGES)) == 0 ) + BUG(); + + pending_cons = 0; + pending_prod = MAX_PENDING_REQS; + memset(pending_reqs, 0, sizeof(pending_reqs)); + for ( i = 0; i < MAX_PENDING_REQS; i++ ) + pending_ring[i] = i; + + spin_lock_init(&io_schedule_list_lock); + INIT_LIST_HEAD(&io_schedule_list); + + buffer_head_cachep = kmem_cache_create( + "buffer_head_cache", sizeof(struct buffer_head), + 0, SLAB_HWCACHE_ALIGN, NULL, NULL); + + blkif_ctrlif_init(); + + return 0; +} + +static void cleanup_module(void) +{ + BUG(); +} + +module_init(init_module); +module_exit(cleanup_module); diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/vbd.c b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/vbd.c new file mode 100644 index 0000000000..6704fbb541 --- /dev/null +++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/backend/vbd.c @@ -0,0 +1,436 @@ +/****************************************************************************** + * arch/xen/drivers/blkif/backend/vbd.c + * + * Routines for managing virtual block devices (VBDs). + * + * Copyright (c) 2003-2004, Keir Fraser & Steve Hand + */ + +#include "common.h" + +void vbd_create(blkif_be_vbd_create_t *create) +{ + vbd_t *vbd; + rb_node_t **rb_p, *rb_parent = NULL; + blkif_t *blkif; + blkif_vdev_t vdevice = create->vdevice; + + blkif = blkif_find_by_handle(create->domid, create->blkif_handle); + if ( unlikely(blkif == NULL) ) + { + DPRINTK("vbd_create attempted for non-existent blkif (%u,%u)\n", + create->domid, create->blkif_handle); + create->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND; + return; + } + + spin_lock(&blkif->vbd_lock); + + rb_p = &blkif->vbd_rb.rb_node; + while ( *rb_p != NULL ) + { + rb_parent = *rb_p; + vbd = rb_entry(rb_parent, vbd_t, rb); + if ( vdevice < vbd->vdevice ) + { + rb_p = &rb_parent->rb_left; + } + else if ( vdevice > vbd->vdevice ) + { + rb_p = &rb_parent->rb_right; + } + else + { + DPRINTK("vbd_create attempted for already existing vbd\n"); + create->status = BLKIF_BE_STATUS_VBD_EXISTS; + goto out; + } + } + + if ( unlikely((vbd = kmalloc(sizeof(vbd_t), GFP_KERNEL)) == NULL) ) + { + DPRINTK("vbd_create: out of memory\n"); + create->status = BLKIF_BE_STATUS_OUT_OF_MEMORY; + goto out; + } + + vbd->vdevice = vdevice; + vbd->readonly = create->readonly; + vbd->type = VDISK_TYPE_DISK | VDISK_FLAG_VIRT; + vbd->extents = NULL; + + rb_link_node(&vbd->rb, rb_parent, rb_p); + rb_insert_color(&vbd->rb, &blkif->vbd_rb); + + DPRINTK("Successful creation of vdev=%04x (dom=%u)\n", + vdevice, create->domid); + create->status = BLKIF_BE_STATUS_OKAY; + + out: + spin_unlock(&blkif->vbd_lock); +} + + +/* Grow a VBD by appending a new extent. Fails if the VBD doesn't exist. 
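+ * (The new extent is linked at the tail of the extent list, preserving the
+ * order in which extents were grown.)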
*/ +void vbd_grow(blkif_be_vbd_grow_t *grow) +{ + blkif_t *blkif; + blkif_extent_le_t **px, *x; + vbd_t *vbd = NULL; + rb_node_t *rb; + blkif_vdev_t vdevice = grow->vdevice; + + blkif = blkif_find_by_handle(grow->domid, grow->blkif_handle); + if ( unlikely(blkif == NULL) ) + { + DPRINTK("vbd_grow attempted for non-existent blkif (%u,%u)\n", + grow->domid, grow->blkif_handle); + grow->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND; + return; + } + + spin_lock(&blkif->vbd_lock); + + rb = blkif->vbd_rb.rb_node; + while ( rb != NULL ) + { + vbd = rb_entry(rb, vbd_t, rb); + if ( vdevice < vbd->vdevice ) + rb = rb->rb_left; + else if ( vdevice > vbd->vdevice ) + rb = rb->rb_right; + else + break; + } + + if ( unlikely(vbd == NULL) || unlikely(vbd->vdevice != vdevice) ) + { + DPRINTK("vbd_grow: attempted to append extent to non-existent VBD.\n"); + grow->status = BLKIF_BE_STATUS_VBD_NOT_FOUND; + goto out; + } + + if ( unlikely((x = kmalloc(sizeof(blkif_extent_le_t), + GFP_KERNEL)) == NULL) ) + { + DPRINTK("vbd_grow: out of memory\n"); + grow->status = BLKIF_BE_STATUS_OUT_OF_MEMORY; + goto out; + } + + x->extent.device = grow->extent.device; + x->extent.sector_start = grow->extent.sector_start; + x->extent.sector_length = grow->extent.sector_length; + x->next = (blkif_extent_le_t *)NULL; + + for ( px = &vbd->extents; *px != NULL; px = &(*px)->next ) + continue; + + *px = x; + + DPRINTK("Successful grow of vdev=%04x (dom=%u)\n", + vdevice, grow->domid); + grow->status = BLKIF_BE_STATUS_OKAY; + + out: + spin_unlock(&blkif->vbd_lock); +} + + +void vbd_shrink(blkif_be_vbd_shrink_t *shrink) +{ + blkif_t *blkif; + blkif_extent_le_t **px, *x; + vbd_t *vbd = NULL; + rb_node_t *rb; + blkif_vdev_t vdevice = shrink->vdevice; + + blkif = blkif_find_by_handle(shrink->domid, shrink->blkif_handle); + if ( unlikely(blkif == NULL) ) + { + DPRINTK("vbd_shrink attempted for non-existent blkif (%u,%u)\n", + shrink->domid, shrink->blkif_handle); + shrink->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND; + return; + } + + spin_lock(&blkif->vbd_lock); + + rb = blkif->vbd_rb.rb_node; + while ( rb != NULL ) + { + vbd = rb_entry(rb, vbd_t, rb); + if ( vdevice < vbd->vdevice ) + rb = rb->rb_left; + else if ( vdevice > vbd->vdevice ) + rb = rb->rb_right; + else + break; + } + + if ( unlikely(vbd == NULL) || unlikely(vbd->vdevice != vdevice) ) + { + shrink->status = BLKIF_BE_STATUS_VBD_NOT_FOUND; + goto out; + } + + if ( unlikely(vbd->extents == NULL) ) + { + shrink->status = BLKIF_BE_STATUS_EXTENT_NOT_FOUND; + goto out; + } + + /* Find the last extent. We now know that there is at least one. 
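+     * (vbd_shrink() thus removes the most recently grown extent.)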
+     */
+    for ( px = &vbd->extents; (*px)->next != NULL; px = &(*px)->next )
+        continue;
+
+    x = *px;
+    *px = x->next;
+    kfree(x);
+
+    shrink->status = BLKIF_BE_STATUS_OKAY;
+
+ out:
+    spin_unlock(&blkif->vbd_lock);
+}
+
+
+void vbd_destroy(blkif_be_vbd_destroy_t *destroy)
+{
+    blkif_t *blkif;
+    vbd_t *vbd;
+    rb_node_t *rb;
+    blkif_extent_le_t *x, *t;
+    blkif_vdev_t vdevice = destroy->vdevice;
+
+    blkif = blkif_find_by_handle(destroy->domid, destroy->blkif_handle);
+    if ( unlikely(blkif == NULL) )
+    {
+        DPRINTK("vbd_destroy attempted for non-existent blkif (%u,%u)\n",
+                destroy->domid, destroy->blkif_handle);
+        destroy->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
+        return;
+    }
+
+    spin_lock(&blkif->vbd_lock);
+
+    rb = blkif->vbd_rb.rb_node;
+    while ( rb != NULL )
+    {
+        vbd = rb_entry(rb, vbd_t, rb);
+        if ( vdevice < vbd->vdevice )
+            rb = rb->rb_left;
+        else if ( vdevice > vbd->vdevice )
+            rb = rb->rb_right;
+        else
+            goto found;
+    }
+
+    destroy->status = BLKIF_BE_STATUS_VBD_NOT_FOUND;
+    goto out;
+
+ found:
+    rb_erase(rb, &blkif->vbd_rb);
+    x = vbd->extents;
+    kfree(vbd);
+
+    while ( x != NULL )
+    {
+        t = x->next;
+        kfree(x);
+        x = t;
+    }
+
+ out:
+    spin_unlock(&blkif->vbd_lock);
+}
+
+
+void destroy_all_vbds(blkif_t *blkif)
+{
+    vbd_t *vbd;
+    rb_node_t *rb;
+    blkif_extent_le_t *x, *t;
+
+    spin_lock(&blkif->vbd_lock);
+
+    while ( (rb = blkif->vbd_rb.rb_node) != NULL )
+    {
+        vbd = rb_entry(rb, vbd_t, rb);
+
+        rb_erase(rb, &blkif->vbd_rb);
+        x = vbd->extents;
+        kfree(vbd);
+
+        while ( x != NULL )
+        {
+            t = x->next;
+            kfree(x);
+            x = t;
+        }
+    }
+
+    spin_unlock(&blkif->vbd_lock);
+}
+
+
+static int vbd_probe_single(blkif_t *blkif, vdisk_t *vbd_info, vbd_t *vbd)
+{
+    blkif_extent_le_t *x;
+
+    vbd_info->device = vbd->vdevice;
+    vbd_info->info = vbd->type;
+    if ( vbd->readonly )
+        vbd_info->info |= VDISK_FLAG_RO;
+    vbd_info->capacity = 0ULL;
+    for ( x = vbd->extents; x != NULL; x = x->next )
+        vbd_info->capacity += x->extent.sector_length;
+
+    return 0;
+}
+
+
+int vbd_probe(blkif_t *blkif, vdisk_t *vbd_info, int max_vbds)
+{
+    int rc = 0, nr_vbds = 0;
+    rb_node_t *rb;
+
+    spin_lock(&blkif->vbd_lock);
+
+    if ( (rb = blkif->vbd_rb.rb_node) == NULL )
+        goto out;
+
+ new_subtree:
+    /* STEP 1. Find least node (it'll be left-most). */
+    while ( rb->rb_left != NULL )
+        rb = rb->rb_left;
+
+    for ( ; ; )
+    {
+        /* STEP 2. Dealt with left subtree. Now process current node. */
+        if ( (rc = vbd_probe_single(blkif, &vbd_info[nr_vbds],
+                                    rb_entry(rb, vbd_t, rb))) != 0 )
+            goto out;
+        if ( ++nr_vbds == max_vbds )
+            goto out;
+
+        /* STEP 3. Process right subtree, if any. */
+        if ( rb->rb_right != NULL )
+        {
+            rb = rb->rb_right;
+            goto new_subtree;
+        }
+
+        /* STEP 4. Done both subtrees. Head back through ancestors. */
+        for ( ; ; )
+        {
+            /* We're done when we get back to the root node. */
+            if ( rb->rb_parent == NULL )
+                goto out;
+            /* If we are left of parent, then parent is next to process. */
+            if ( rb->rb_parent->rb_left == rb )
+                break;
+            /* If we are right of parent, then we climb to grandparent. */
+            rb = rb->rb_parent;
+        }
+
+        rb = rb->rb_parent;
+    }
+
+ out:
+    spin_unlock(&blkif->vbd_lock);
+    return (rc == 0) ? nr_vbds : rc;
+}
+
+
+int vbd_translate(phys_seg_t *pseg, blkif_t *blkif, int operation)
+{
+    blkif_extent_le_t *x;
+    vbd_t *vbd;
+    rb_node_t *rb;
+    blkif_sector_t sec_off;
+    unsigned long nr_secs;
+
+    spin_lock(&blkif->vbd_lock);
+
+    rb = blkif->vbd_rb.rb_node;
+    while ( rb != NULL )
+    {
+        vbd = rb_entry(rb, vbd_t, rb);
+        if ( pseg->dev < vbd->vdevice )
+            rb = rb->rb_left;
+        else if ( pseg->dev > vbd->vdevice )
+            rb = rb->rb_right;
+        else
+            goto found;
+    }
+
+    DPRINTK("vbd_translate: domain %u attempted to access "
+            "non-existent VBD.\n", blkif->domid);
+
+    spin_unlock(&blkif->vbd_lock);
+    return -ENODEV;
+
+ found:
+
+    if ( (operation == WRITE) && vbd->readonly )
+    {
+        spin_unlock(&blkif->vbd_lock);
+        return -EACCES;
+    }
+
+    /*
+     * Now iterate through the list of blkif_extents, working out which should
+     * be used to perform the translation.
+     */
+    sec_off = pseg->sector_number;
+    nr_secs = pseg->nr_sects;
+    for ( x = vbd->extents; x != NULL; x = x->next )
+    {
+        if ( sec_off < x->extent.sector_length )
+        {
+            pseg->dev = x->extent.device;
+            pseg->sector_number = x->extent.sector_start + sec_off;
+            if ( unlikely((sec_off + nr_secs) > x->extent.sector_length) )
+                goto overrun;
+            spin_unlock(&blkif->vbd_lock);
+            return 1;
+        }
+        sec_off -= x->extent.sector_length;
+    }
+
+    DPRINTK("vbd_translate: end of vbd.\n");
+    spin_unlock(&blkif->vbd_lock);
+    return -EACCES;
+
+    /*
+     * Here we deal with overrun onto the following extent. We don't deal with
+     * overrun of more than one boundary since each request is restricted to
+     * 2^9 512-byte sectors, so it should be trivial for control software to
+     * ensure that extents are large enough to prevent excessive overrun.
+     */
+ overrun:
+
+    /* Adjust length of first chunk to run to end of first extent. */
+    pseg[0].nr_sects = x->extent.sector_length - sec_off;
+
+    /* Set second chunk buffer and length to start where first chunk ended. */
+    pseg[1].buffer = pseg[0].buffer + (pseg[0].nr_sects << 9);
+    pseg[1].nr_sects = nr_secs - pseg[0].nr_sects;
+
+    /* Now move to the next extent. Check it exists and is long enough! */
+    if ( unlikely((x = x->next) == NULL) ||
+         unlikely(x->extent.sector_length < pseg[1].nr_sects) )
+    {
+        DPRINTK("vbd_translate: multiple overruns or end of vbd.\n");
+        spin_unlock(&blkif->vbd_lock);
+        return -EACCES;
+    }
+
+    /* Store the real device and start sector for the second chunk. */
+    pseg[1].dev = x->extent.device;
+    pseg[1].sector_number = x->extent.sector_start;
+
+    spin_unlock(&blkif->vbd_lock);
+    return 2;
+}
diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/blkif.h b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/blkif.h
new file mode 100644
index 0000000000..1024629ea7
--- /dev/null
+++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/blkif.h
@@ -0,0 +1,115 @@
+/******************************************************************************
+ * blkif.h
+ *
+ * Unified block-device I/O interface for Xen guest OSes.
+ *
+ * Copyright (c) 2003-2004, Keir Fraser
+ */
+
+#ifndef __SHARED_BLKIF_H__
+#define __SHARED_BLKIF_H__
+
+#define blkif_vdev_t u16
+#define blkif_sector_t u64
+
+#define BLKIF_OP_READ 0
+#define BLKIF_OP_WRITE 1
+#define BLKIF_OP_PROBE 2
+
+/* NB. Ring size must be small enough for sizeof(blkif_ring_t) <= PAGE_SIZE. */
+#define BLKIF_RING_SIZE 64
+
+/*
+ * Maximum scatter/gather segments per request.
+ * This is carefully chosen so that sizeof(blkif_ring_t) <= PAGE_SIZE.
+ * NB. This could be 12 if the ring indexes weren't stored in the same page.
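+ * (With 11 segments each request is 60 bytes, so 64 ring entries plus the
+ * two ring indexes come to 3848 bytes; at 12 segments the 64 entries alone
+ * would fill the entire 4096-byte page.)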
+ */ +#define BLKIF_MAX_SEGMENTS_PER_REQUEST 11 + +typedef struct { + u8 operation; /* 0: BLKIF_OP_??? */ + u8 nr_segments; /* 1: number of segments */ + blkif_vdev_t device; /* 2: only for read/write requests */ + unsigned long id; /* 4: private guest value, echoed in resp */ + blkif_sector_t sector_number; /* start sector idx on disk (r/w only) */ + /* @f_a_s[2:0]=last_sect ; @f_a_s[5:3]=first_sect ; @f_a_s[:12]=frame. */ + /* @first_sect: first sector in frame to transfer (inclusive). */ + /* @last_sect: last sector in frame to transfer (inclusive). */ + /* @frame: machine page frame number. */ + unsigned long frame_and_sects[BLKIF_MAX_SEGMENTS_PER_REQUEST]; +} PACKED blkif_request_t; + +#define blkif_first_sect(_fas) (((_fas)>>3)&7) +#define blkif_last_sect(_fas) ((_fas)&7) + +typedef struct { + unsigned long id; /* copied from request */ + u8 operation; /* copied from request */ + s16 status; /* BLKIF_RSP_??? */ +} PACKED blkif_response_t; + +#define BLKIF_RSP_ERROR -1 /* non-specific 'error' */ +#define BLKIF_RSP_OKAY 0 /* non-specific 'okay' */ + +/* + * We use a special capitalised type name because it is _essential_ that all + * arithmetic on indexes is done on an integer type of the correct size. + */ +typedef u32 BLKIF_RING_IDX; + +/* + * Ring indexes are 'free running'. That is, they are not stored modulo the + * size of the ring buffer. The following macro converts a free-running counter + * into a value that can directly index a ring-buffer array. + */ +#define MASK_BLKIF_IDX(_i) ((_i)&(BLKIF_RING_SIZE-1)) + +typedef struct { + BLKIF_RING_IDX req_prod; /* 0: Request producer. Updated by front-end. */ + BLKIF_RING_IDX resp_prod; /* 4: Response producer. Updated by back-end. */ + union { /* 8 */ + blkif_request_t req; + blkif_response_t resp; + } PACKED ring[BLKIF_RING_SIZE]; +} PACKED blkif_ring_t; + + +/* + * BLKIF_OP_PROBE: + * The request format for a probe request is constrained as follows: + * @operation == BLKIF_OP_PROBE + * @nr_segments == size of probe buffer in pages + * @device == unused (zero) + * @id == any value (echoed in response message) + * @sector_num == unused (zero) + * @frame_and_sects == list of page-sized buffers. + * (i.e., @first_sect == 0, @last_sect == 7). + * + * The response is a list of vdisk_t elements copied into the out-of-band + * probe buffer. On success the response status field contains the number + * of vdisk_t elements. + */ + +/* XXX SMH: Type values below are chosen to match ide_xxx in Linux ide.h. */ +#define VDISK_TYPE_FLOPPY 0x00 +#define VDISK_TYPE_TAPE 0x01 +#define VDISK_TYPE_CDROM 0x05 +#define VDISK_TYPE_OPTICAL 0x07 +#define VDISK_TYPE_DISK 0x20 + +#define VDISK_TYPE_MASK 0x3F +#define VDISK_TYPE(_x) ((_x) & VDISK_TYPE_MASK) + +/* The top two bits of the type field encode various flags. */ +#define VDISK_FLAG_RO 0x40 +#define VDISK_FLAG_VIRT 0x80 +#define VDISK_READONLY(_x) ((_x) & VDISK_FLAG_RO) +#define VDISK_VIRTUAL(_x) ((_x) & VDISK_FLAG_VIRT) + +typedef struct { + blkif_sector_t capacity; /* 0: Size in terms of 512-byte sectors. */ + blkif_vdev_t device; /* 8: Device number (opaque 16 bit value). */ + u16 info; /* 10: Device type and flags (VDISK_*). 
*/ +} PACKED vdisk_t; /* 12 bytes */ + +#endif /* __SHARED_BLKIF_H__ */ diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/frontend/Makefile b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/frontend/Makefile new file mode 100644 index 0000000000..b0d27cf698 --- /dev/null +++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/frontend/Makefile @@ -0,0 +1,3 @@ +O_TARGET := drv.o +obj-y := main.o vbd.o +include $(TOPDIR)/Rules.make diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/frontend/common.h b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/frontend/common.h new file mode 100644 index 0000000000..2d4415bdef --- /dev/null +++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/frontend/common.h @@ -0,0 +1,84 @@ +/****************************************************************************** + * arch/xen/drivers/blkif/frontend/common.h + * + * Shared definitions between all levels of XenoLinux Virtual block devices. + */ + +#ifndef __XEN_DRIVERS_COMMON_H__ +#define __XEN_DRIVERS_COMMON_H__ + +#include <linux/config.h> +#include <linux/module.h> + +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/errno.h> + +#include <linux/fs.h> +#include <linux/hdreg.h> +#include <linux/blkdev.h> +#include <linux/major.h> + +#include <asm/hypervisor-ifs/hypervisor-if.h> +#include <asm/hypervisor-ifs/vbd.h> +#include <asm/io.h> +#include <asm/atomic.h> +#include <asm/uaccess.h> + +#include "../blkif.h" + +#if 0 +#define DPRINTK(_f, _a...) printk ( KERN_ALERT _f , ## _a ) +#else +#define DPRINTK(_f, _a...) ((void)0) +#endif + +#if 0 +#define DPRINTK_IOCTL(_f, _a...) printk ( KERN_ALERT _f , ## _a ) +#else +#define DPRINTK_IOCTL(_f, _a...) ((void)0) +#endif + +/* Private gendisk->flags[] values. */ +#define GENHD_FL_XEN 2 /* Is unit a Xen block device? */ +#define GENHD_FL_VIRT_PARTNS 4 /* Are unit partitions virtual? */ + +/* + * We have one of these per vbd, whether ide, scsi or 'other'. + * They hang in an array off the gendisk structure. We may end up putting + * all kinds of interesting stuff here :-) + */ +typedef struct xl_disk { + int usage; +} xl_disk_t; + +extern int blkif_open(struct inode *inode, struct file *filep); +extern int blkif_release(struct inode *inode, struct file *filep); +extern int blkif_ioctl(struct inode *inode, struct file *filep, + unsigned command, unsigned long argument); +extern int blkif_check(kdev_t dev); +extern int blkif_revalidate(kdev_t dev); +extern void blkif_control_send(blkif_request_t *req, blkif_response_t *rsp); +extern void do_blkif_request (request_queue_t *rq); + +extern void xlvbd_update_vbds(void); + +static inline xl_disk_t *xldev_to_xldisk(kdev_t xldev) +{ + struct gendisk *gd = get_gendisk(xldev); + + if ( gd == NULL ) + return NULL; + + return (xl_disk_t *)gd->real_devices + + (MINOR(xldev) >> gd->minor_shift); +} + + +/* Virtual block-device subsystem. */ +extern int xlvbd_init(void); +extern void xlvbd_cleanup(void); + +#endif /* __XEN_DRIVERS_COMMON_H__ */ diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/frontend/main.c b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/frontend/main.c new file mode 100644 index 0000000000..4e5dbca093 --- /dev/null +++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/frontend/main.c @@ -0,0 +1,814 @@ +/****************************************************************************** + * arch/xen/drivers/blkif/frontend/main.c + * + * Xenolinux virtual block-device driver. 
+ * + * Copyright (c) 2003-2004, Keir Fraser & Steve Hand + * Modifications by Mark A. Williamson are (c) Intel Research Cambridge + */ + +#include "common.h" +#include <linux/blk.h> +#include <linux/cdrom.h> +#include <linux/tqueue.h> +#include <linux/sched.h> +#include <linux/interrupt.h> +#include <scsi/scsi.h> +#include <asm/ctrl_if.h> + + + +typedef unsigned char byte; /* from linux/ide.h */ + +#define BLKIF_STATE_CLOSED 0 +#define BLKIF_STATE_DISCONNECTED 1 +#define BLKIF_STATE_CONNECTED 2 +static unsigned int blkif_state = BLKIF_STATE_CLOSED; +static unsigned int blkif_evtchn, blkif_irq; + +static int blkif_control_rsp_valid; +static blkif_response_t blkif_control_rsp; + +static blkif_ring_t *blk_ring; +static BLK_RING_IDX resp_cons; /* Response consumer for comms ring. */ +static BLK_RING_IDX req_prod; /* Private request producer. */ + + +static blkif_ring_t *blk_ring_rec; /* Private copy of requests, used for + * recovery. Responses not stored here. */ +static BLK_RING_IDX resp_cons_rec; /* Copy of response consumer, used for + * recovery */ +static int recovery = 0; /* "Recovery in progress" flag. Protected + * by the io_request_lock */ + + +/* We plug the I/O ring if the driver is suspended or if the ring is full. */ +#define RING_PLUGGED (((req_prod - resp_cons) == BLK_RING_SIZE) || \ + (blkif_state != BLKIF_STATE_CONNECTED)) + + +/* + * Request queues with outstanding work, but ring is currently full. + * We need no special lock here, as we always access this with the + * io_request_lock held. We only need a small maximum list. + */ +#define MAX_PENDING 8 +static request_queue_t *pending_queues[MAX_PENDING]; +static int nr_pending; + +static kdev_t sg_dev; +static int sg_operation = -1; +static unsigned long sg_next_sect; +#define DISABLE_SCATTERGATHER() (sg_operation = -1) + +static inline void flush_requests(void) +{ + DISABLE_SCATTERGATHER(); + blk_ring->req_prod = req_prod; + notify_via_evtchn(blkif_evtchn); +} + + +/* + * blkif_update_int/update-vbds_task - handle VBD update events. + * Schedule a task for keventd to run, which will update the VBDs and perform + * the corresponding updates to our view of VBD state. + */ +static struct tq_struct update_tq; +static void update_vbds_task(void *unused) +{ + xlvbd_update_vbds(); +} + + +int blkif_open(struct inode *inode, struct file *filep) +{ + short xldev = inode->i_rdev; + struct gendisk *gd = get_gendisk(xldev); + xl_disk_t *disk = xldev_to_xldisk(inode->i_rdev); + short minor = MINOR(xldev); + + if ( gd->part[minor].nr_sects == 0 ) + { + /* + * Device either doesn't exist, or has zero capacity; we use a few + * cheesy heuristics to return the relevant error code + */ + if ( (gd->sizes[minor >> gd->minor_shift] != 0) || + ((minor & (gd->max_p - 1)) != 0) ) + { + /* + * We have a real device, but no such partition, or we just have a + * partition number so guess this is the problem. + */ + return -ENXIO; /* no such device or address */ + } + else if ( gd->flags[minor >> gd->minor_shift] & GENHD_FL_REMOVABLE ) + { + /* This is a removable device => assume that media is missing. */ + return -ENOMEDIUM; /* media not present (this is a guess) */ + } + else + { + /* Just go for the general 'no such device' error. */ + return -ENODEV; /* no such device */ + } + } + + /* Update of usage count is protected by per-device semaphore. 
*/ + disk->usage++; + + return 0; +} + + +int blkif_release(struct inode *inode, struct file *filep) +{ + xl_disk_t *disk = xldev_to_xldisk(inode->i_rdev); + + /* + * When usage drops to zero it may allow more VBD updates to occur. + * Update of usage count is protected by a per-device semaphore. + */ + if ( --disk->usage == 0 ) + { +#if 0 + update_tq.routine = update_vbds_task; + schedule_task(&update_tq); +#endif + } + + return 0; +} + + +int blkif_ioctl(struct inode *inode, struct file *filep, + unsigned command, unsigned long argument) +{ + kdev_t dev = inode->i_rdev; + struct hd_geometry *geo = (struct hd_geometry *)argument; + struct gendisk *gd; + struct hd_struct *part; + int i; + + /* NB. No need to check permissions. That is done for us. */ + + DPRINTK_IOCTL("command: 0x%x, argument: 0x%lx, dev: 0x%04x\n", + command, (long) argument, dev); + + gd = get_gendisk(dev); + part = &gd->part[MINOR(dev)]; + + switch ( command ) + { + case BLKGETSIZE: + DPRINTK_IOCTL(" BLKGETSIZE: %x %lx\n", BLKGETSIZE, part->nr_sects); + return put_user(part->nr_sects, (unsigned long *) argument); + + case BLKGETSIZE64: + DPRINTK_IOCTL(" BLKGETSIZE64: %x %llx\n", BLKGETSIZE64, + (u64)part->nr_sects * 512); + return put_user((u64)part->nr_sects * 512, (u64 *) argument); + + case BLKRRPART: /* re-read partition table */ + DPRINTK_IOCTL(" BLKRRPART: %x\n", BLKRRPART); + return blkif_revalidate(dev); + + case BLKSSZGET: + return hardsect_size[MAJOR(dev)][MINOR(dev)]; + + case BLKBSZGET: /* get block size */ + DPRINTK_IOCTL(" BLKBSZGET: %x\n", BLKBSZGET); + break; + + case BLKBSZSET: /* set block size */ + DPRINTK_IOCTL(" BLKBSZSET: %x\n", BLKBSZSET); + break; + + case BLKRASET: /* set read-ahead */ + DPRINTK_IOCTL(" BLKRASET: %x\n", BLKRASET); + break; + + case BLKRAGET: /* get read-ahead */ + DPRINTK_IOCTL(" BLKRAGET: %x\n", BLKRAGET); + break; + + case HDIO_GETGEO: + /* note: these values are complete garbage */ + DPRINTK_IOCTL(" HDIO_GETGEO: %x\n", HDIO_GETGEO); + if (!argument) return -EINVAL; + if (put_user(0x00, (unsigned long *) &geo->start)) return -EFAULT; + if (put_user(0xff, (byte *)&geo->heads)) return -EFAULT; + if (put_user(0x3f, (byte *)&geo->sectors)) return -EFAULT; + if (put_user(0x106, (unsigned short *)&geo->cylinders)) return -EFAULT; + return 0; + + case HDIO_GETGEO_BIG: + /* note: these values are complete garbage */ + DPRINTK_IOCTL(" HDIO_GETGEO_BIG: %x\n", HDIO_GETGEO_BIG); + if (!argument) return -EINVAL; + if (put_user(0x00, (unsigned long *) &geo->start)) return -EFAULT; + if (put_user(0xff, (byte *)&geo->heads)) return -EFAULT; + if (put_user(0x3f, (byte *)&geo->sectors)) return -EFAULT; + if (put_user(0x106, (unsigned int *) &geo->cylinders)) return -EFAULT; + return 0; + + case CDROMMULTISESSION: + DPRINTK("FIXME: support multisession CDs later\n"); + for ( i = 0; i < sizeof(struct cdrom_multisession); i++ ) + if ( put_user(0, (byte *)(argument + i)) ) return -EFAULT; + return 0; + + case SCSI_IOCTL_GET_BUS_NUMBER: + DPRINTK("FIXME: SCSI_IOCTL_GET_BUS_NUMBER ioctl in XL blkif"); + return -ENOSYS; + + default: + printk(KERN_ALERT "ioctl %08x not supported by XL blkif\n", command); + return -ENOSYS; + } + + return 0; +} + +/* check media change: should probably do something here in some cases :-) */ +int blkif_check(kdev_t dev) +{ + DPRINTK("blkif_check\n"); + return 0; +} + +int blkif_revalidate(kdev_t dev) +{ + struct block_device *bd; + struct gendisk *gd; + xl_disk_t *disk; + unsigned long capacity; + int i, rc = 0; + + if ( (bd = bdget(dev)) == NULL ) + return -EINVAL; + + 
/* + * Update of partition info, and check of usage count, is protected + * by the per-block-device semaphore. + */ + down(&bd->bd_sem); + + if ( ((gd = get_gendisk(dev)) == NULL) || + ((disk = xldev_to_xldisk(dev)) == NULL) || + ((capacity = gd->part[MINOR(dev)].nr_sects) == 0) ) + { + rc = -EINVAL; + goto out; + } + + if ( disk->usage > 1 ) + { + rc = -EBUSY; + goto out; + } + + /* Only reread partition table if VBDs aren't mapped to partitions. */ + if ( !(gd->flags[MINOR(dev) >> gd->minor_shift] & GENHD_FL_VIRT_PARTNS) ) + { + for ( i = gd->max_p - 1; i >= 0; i-- ) + { + invalidate_device(dev+i, 1); + gd->part[MINOR(dev+i)].start_sect = 0; + gd->part[MINOR(dev+i)].nr_sects = 0; + gd->sizes[MINOR(dev+i)] = 0; + } + + grok_partitions(gd, MINOR(dev)>>gd->minor_shift, gd->max_p, capacity); + } + + out: + up(&bd->bd_sem); + bdput(bd); + return rc; +} + + +/* + * blkif_queue_request + * + * request block io + * + * id: for guest use only. + * operation: BLKIF_OP_{READ,WRITE,PROBE} + * buffer: buffer to read/write into. this should be a + * virtual address in the guest os. + */ +static int blkif_queue_request(unsigned long id, + int operation, + char * buffer, + unsigned long sector_number, + unsigned short nr_sectors, + kdev_t device) +{ + unsigned long buffer_ma = phys_to_machine(virt_to_phys(buffer)); + struct gendisk *gd; + blkif_request_t *req; + struct buffer_head *bh; + unsigned int fsect, lsect; + + fsect = (buffer_ma & ~PAGE_MASK) >> 9; + lsect = fsect + nr_sectors - 1; + + /* Buffer must be sector-aligned. Extent mustn't cross a page boundary. */ + if ( unlikely((buffer_ma & ((1<<9)-1)) != 0) ) + BUG(); + if ( lsect > 7 ) + BUG(); + + buffer_ma &= PAGE_MASK; + + if ( unlikely(blkif_state != BLKIF_STATE_CONNECTED) ) + return 1; + + switch ( operation ) + { + + case BLKIF_OP_READ: + case BLKIF_OP_WRITE: + gd = get_gendisk(device); + + /* + * Update the sector_number we'll pass down as appropriate; note that + * we could sanity check that resulting sector will be in this + * partition, but this will happen in driver backend anyhow. + */ + sector_number += gd->part[MINOR(device)].start_sect; + + /* + * If this unit doesn't consist of virtual partitions then we clear + * the partn bits from the device number. + */ + if ( !(gd->flags[MINOR(device)>>gd->minor_shift] & + GENHD_FL_VIRT_PARTNS) ) + device &= ~(gd->max_p - 1); + + if ( (sg_operation == operation) && + (sg_dev == device) && + (sg_next_sect == sector_number) ) + { + req = &blk_ring->ring[MASK_BLK_IDX(req_prod-1)].req; + bh = (struct buffer_head *)id; + bh->b_reqnext = (struct buffer_head *)req->id; + req->id = id; + req->frame_and_sects[req->nr_segments] = + buffer_ma | (fsect<<3) | lsect; + if ( ++req->nr_segments < BLKIF_MAX_SEGMENTS_PER_REQUEST ) + sg_next_sect += nr_sectors; + else + DISABLE_SCATTERGATHER(); + + /* Update the copy of the request in the recovery ring. */ + blk_ring_rec->ring[MASK_BLK_IDX(blk_ring_rec->req_prod - 1)].req + = *req; + + return 0; + } + else if ( RING_PLUGGED ) + { + return 1; + } + else + { + sg_operation = operation; + sg_dev = device; + sg_next_sect = sector_number + nr_sectors; + } + break; + + default: + panic("unknown op %d\n", operation); + } + + /* Fill out a communications ring structure. 
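+     * For illustration: each frame_and_sects[] entry packs the machine page +     * address of the buffer with a first/last-sector pair within that page +     * (eight 512-byte sectors per 4KB page, three bits each). E.g. a buffer +     * at page offset 0x400 spanning three sectors gives fsect = 2, lsect = 4, +     * so the entry is page_base | (2<<3) | 4.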
*/ + req = &blk_ring->ring[MASK_BLK_IDX(req_prod)].req; + req->id = id; + req->operation = operation; + req->sector_number = (blkif_sector_t)sector_number; + req->device = device; + req->nr_segments = 1; + req->frame_and_sects[0] = buffer_ma | (fsect<<3) | lsect; + req_prod++; + + /* Keep a private copy so we can reissue requests when recovering. */ + blk_ring_rec->ring[MASK_BLK_IDX(blk_ring_rec->req_prod)].req = *req; + blk_ring_rec->req_prod++; + + return 0; +} + + +/* + * do_blkif_request + * read a block; request is in a request queue + */ +void do_blkif_request(request_queue_t *rq) +{ + struct request *req; + struct buffer_head *bh, *next_bh; + int rw, nsect, full, queued = 0; + + DPRINTK("Entered do_blkif_request\n"); + + while ( !rq->plugged && !list_empty(&rq->queue_head)) + { + if ( (req = blkdev_entry_next_request(&rq->queue_head)) == NULL ) + goto out; + + DPRINTK("do_blkif_request %p: cmd %i, sec %lx, (%li/%li) bh:%p\n", + req, req->cmd, req->sector, + req->current_nr_sectors, req->nr_sectors, req->bh); + + rw = req->cmd; + if ( rw == READA ) + rw = READ; + if ( unlikely((rw != READ) && (rw != WRITE)) ) + panic("XenoLinux Virtual Block Device: bad cmd: %d\n", rw); + + req->errors = 0; + + bh = req->bh; + while ( bh != NULL ) + { + next_bh = bh->b_reqnext; + bh->b_reqnext = NULL; + + full = blkif_queue_request( + (unsigned long)bh, + (rw == READ) ? BLKIF_OP_READ : BLKIF_OP_WRITE, + bh->b_data, bh->b_rsector, bh->b_size>>9, bh->b_rdev); + + if ( full ) + { + bh->b_reqnext = next_bh; + pending_queues[nr_pending++] = rq; + if ( unlikely(nr_pending >= MAX_PENDING) ) + BUG(); + goto out; + } + + queued++; + + /* Dequeue the buffer head from the request. */ + nsect = bh->b_size >> 9; + bh = req->bh = next_bh; + + if ( bh != NULL ) + { + /* There's another buffer head to do. Update the request. */ + req->hard_sector += nsect; + req->hard_nr_sectors -= nsect; + req->sector = req->hard_sector; + req->nr_sectors = req->hard_nr_sectors; + req->current_nr_sectors = bh->b_size >> 9; + req->buffer = bh->b_data; + } + else + { + /* That was the last buffer head. Finalise the request. */ + if ( unlikely(end_that_request_first(req, 1, "XenBlk")) ) + BUG(); + blkdev_dequeue_request(req); + end_that_request_last(req); + } + } + } + + out: + if ( queued != 0 ) + flush_requests(); +} + + +static void kick_pending_request_queues(void) +{ + /* We kick pending request queues if the ring is reasonably empty. */ + if ( (nr_pending != 0) && + ((req_prod - resp_cons) < (BLK_RING_SIZE >> 1)) ) + { + /* Attempt to drain the queue, but bail if the ring becomes full. 
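+         * Note that pending_queues[] behaves as a stack: the most recently +         * plugged queue is drained first. Entries are only ever added, under +         * io_request_lock, while the ring is full, so a small MAX_PENDING +         * suffices.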
*/ + while ( (nr_pending != 0) && !RING_PLUGGED ) + do_blkif_request(pending_queues[--nr_pending]); + } +} + + +static void blkif_int(int irq, void *dev_id, struct pt_regs *ptregs) +{ + BLK_RING_IDX i; + unsigned long flags; + struct buffer_head *bh, *next_bh; + +// printk(KERN_ALERT "blkif_int\n"); + + spin_lock_irqsave(&io_request_lock, flags); + + if ( unlikely(blkif_state == BLKIF_STATE_CLOSED || recovery) ) + { + printk("Bailed out\n"); + + spin_unlock_irqrestore(&io_request_lock, flags); + return; + } + + for ( i = resp_cons; i != blk_ring->resp_prod; i++ ) + { + blkif_response_t *bret = &blk_ring->ring[MASK_BLK_IDX(i)].resp; + switch ( bret->operation ) + { + case BLKIF_OP_READ: + case BLKIF_OP_WRITE: + if ( unlikely(bret->status != BLKIF_RSP_OKAY) ) + DPRINTK("Bad return from blkdev data request: %lx\n", + bret->status); + for ( bh = (struct buffer_head *)bret->id; + bh != NULL; + bh = next_bh ) + { + next_bh = bh->b_reqnext; + bh->b_reqnext = NULL; + bh->b_end_io(bh, bret->status == BLKIF_RSP_OKAY); + } + break; + case BLKIF_OP_PROBE: + memcpy(&blkif_control_rsp, bret, sizeof(*bret)); + blkif_control_rsp_valid = 1; + break; + default: + BUG(); + } + } + + resp_cons = i; + resp_cons_rec = i; + + kick_pending_request_queues(); + + spin_unlock_irqrestore(&io_request_lock, flags); +} + + +void blkif_control_send(blkif_request_t *req, blkif_response_t *rsp) +{ + unsigned long flags; + + retry: + while ( (req_prod - resp_cons) == BLK_RING_SIZE ) + { + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(1); + } + + spin_lock_irqsave(&io_request_lock, flags); + if ( (req_prod - resp_cons) == BLK_RING_SIZE ) + { + spin_unlock_irqrestore(&io_request_lock, flags); + goto retry; + } + + DISABLE_SCATTERGATHER(); + memcpy(&blk_ring->ring[MASK_BLK_IDX(req_prod)].req, req, sizeof(*req)); + memcpy(&blk_ring_rec->ring[MASK_BLK_IDX(blk_ring_rec->req_prod++)].req, + req, sizeof(*req)); + req_prod++; + flush_requests(); + + spin_unlock_irqrestore(&io_request_lock, flags); + + while ( !blkif_control_rsp_valid ) + { + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(1); + } + + memcpy(rsp, &blkif_control_rsp, sizeof(*rsp)); + blkif_control_rsp_valid = 0; +} + + +static void blkif_status_change(blkif_fe_interface_status_changed_t *status) +{ + ctrl_msg_t cmsg; + blkif_fe_interface_connect_t up; + + if ( status->handle != 0 ) + { + printk(KERN_WARNING "Status change on unsupported blkif %d\n", + status->handle); + return; + } + + switch ( status->status ) + { + case BLKIF_INTERFACE_STATUS_DESTROYED: + printk(KERN_WARNING "Unexpected blkif-DESTROYED message in state %d\n", + blkif_state); + break; + + case BLKIF_INTERFACE_STATUS_DISCONNECTED: + if ( blkif_state != BLKIF_STATE_CLOSED ) + { + printk(KERN_WARNING "Unexpected blkif-DISCONNECTED message" + " in state %d\n", blkif_state); + + printk(KERN_INFO "VBD driver recovery in progress\n"); + + /* Prevent new requests being issued until we've fixed things up. */ + spin_lock_irq(&io_request_lock); + recovery = 1; + blkif_state = BLKIF_STATE_DISCONNECTED; + spin_unlock_irq(&io_request_lock); + + /* Free resources associated with old device channel. */ + free_page((unsigned long)blk_ring); + free_irq(blkif_irq, NULL); + unbind_evtchn_from_irq(blkif_evtchn); + } + + /* Move from CLOSED to DISCONNECTED state. 
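+     * A fresh page is allocated for the shared ring and its machine frame +     * number is handed to the controller in the CONNECT message below, so +     * that the backend can map the same page into its own address space.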
*/ + blk_ring = (blkif_ring_t *)__get_free_page(GFP_KERNEL); + blk_ring->req_prod = blk_ring->resp_prod = resp_cons = req_prod = 0; + blkif_state = BLKIF_STATE_DISCONNECTED; + + /* Construct an interface-CONNECT message for the domain controller. */ + cmsg.type = CMSG_BLKIF_FE; + cmsg.subtype = CMSG_BLKIF_FE_INTERFACE_CONNECT; + cmsg.length = sizeof(blkif_fe_interface_connect_t); + up.handle = 0; + up.shmem_frame = virt_to_machine(blk_ring) >> PAGE_SHIFT; + memcpy(cmsg.msg, &up, sizeof(up)); + + /* Tell the controller to bring up the interface. */ + ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE); + break; + + case BLKIF_INTERFACE_STATUS_CONNECTED: + if ( blkif_state == BLKIF_STATE_CLOSED ) + { + printk(KERN_WARNING "Unexpected blkif-CONNECTED message" + " in state %d\n", blkif_state); + break; + } + + blkif_evtchn = status->evtchn; + blkif_irq = bind_evtchn_to_irq(blkif_evtchn); + (void)request_irq(blkif_irq, blkif_int, 0, "blkif", NULL); + + if ( recovery ) + { + int i; + + /* Shouldn't need the io_request_lock here - the device is + * plugged and the recovery flag prevents the interrupt handler + * changing anything. */ + + /* Reissue requests from the private block ring. */ + for ( i = 0; + resp_cons_rec < blk_ring_rec->req_prod; + resp_cons_rec++, i++ ) + { + blk_ring->ring[i].req + = blk_ring_rec->ring[MASK_BLK_IDX(resp_cons_rec)].req; + } + + /* Reset the private block ring to match the new ring. */ + memcpy(blk_ring_rec, blk_ring, sizeof(*blk_ring)); + resp_cons_rec = 0; + + /* blk_ring->req_prod will be set when we flush_requests().*/ + blk_ring_rec->req_prod = req_prod = i; + + wmb(); + + /* Switch off recovery mode, using a memory barrier to ensure that + * it's seen before we flush requests - we don't want to miss any + * interrupts. */ + recovery = 0; + wmb(); + + /* Kicks things back into life. */ + flush_requests(); + } + else + { + /* Probe for discs that are attached to the interface. */ + xlvbd_init(); + } + + blkif_state = BLKIF_STATE_CONNECTED; + + /* Kick pending requests. */ + spin_lock_irq(&io_request_lock); + kick_pending_request_queues(); + spin_unlock_irq(&io_request_lock); + + break; + + default: + printk(KERN_WARNING "Status change to unknown value %d\n", + status->status); + break; + } +} + + +static void blkif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id) +{ + switch ( msg->subtype ) + { + case CMSG_BLKIF_FE_INTERFACE_STATUS_CHANGED: + if ( msg->length != sizeof(blkif_fe_interface_status_changed_t) ) + goto parse_error; + blkif_status_change((blkif_fe_interface_status_changed_t *) + &msg->msg[0]); + break; +#if 0 + case CMSG_BLKIF_FE_VBD_STATUS_CHANGED: + update_tq.routine = update_vbds_task; + schedule_task(&update_tq); + break; +#endif + default: + goto parse_error; + } + + ctrl_if_send_response(msg); + return; + + parse_error: + msg->length = 0; + ctrl_if_send_response(msg); +} + + +int __init xlblk_init(void) +{ + ctrl_msg_t cmsg; + blkif_fe_driver_status_changed_t st; + + if ( (start_info.flags & SIF_INITDOMAIN) + || (start_info.flags & SIF_BLK_BE_DOMAIN) ) + return 0; + + printk(KERN_INFO "Initialising Xen virtual block device\n"); + + blk_ring_rec = (blkif_ring_t *)__get_free_page(GFP_KERNEL); + memset(blk_ring_rec, 0, sizeof(*blk_ring_rec)); + + (void)ctrl_if_register_receiver(CMSG_BLKIF_FE, blkif_ctrlif_rx, + CALLBACK_IN_BLOCKING_CONTEXT); + + /* Send a driver-UP notification to the domain controller. 
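+     * The handshake then proceeds as implemented in blkif_status_change() +     * above: the controller answers with a DISCONNECTED status change, we +     * allocate the shared ring and reply with INTERFACE_CONNECT, and a final +     * CONNECTED status change delivers the event channel we bind to an IRQ.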
*/ + cmsg.type = CMSG_BLKIF_FE; + cmsg.subtype = CMSG_BLKIF_FE_DRIVER_STATUS_CHANGED; + cmsg.length = sizeof(blkif_fe_driver_status_changed_t); + st.status = BLKIF_DRIVER_STATUS_UP; + memcpy(cmsg.msg, &st, sizeof(st)); + ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE); + + /* + * We should read 'nr_interfaces' from response message and wait + * for notifications before proceeding. For now we assume that we + * will be notified of exactly one interface. + */ + while ( blkif_state != BLKIF_STATE_CONNECTED ) + { + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(1); + } + + return 0; +} + + +static void __exit xlblk_cleanup(void) +{ + /* XXX FIXME */ + BUG(); +} + + +#ifdef MODULE +module_init(xlblk_init); +module_exit(xlblk_cleanup); +#endif + + +void blkdev_suspend(void) +{ + /* XXX FIXME */ + BUG(); +} + + +void blkdev_resume(void) +{ + /* XXX FIXME */ + BUG(); +} diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/frontend/vbd.c b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/frontend/vbd.c new file mode 100644 index 0000000000..12ce976cb5 --- /dev/null +++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/blkif/frontend/vbd.c @@ -0,0 +1,561 @@ +/****************************************************************************** + * arch/xen/drivers/blkif/frontend/vbd.c + * + * Xenolinux virtual block-device driver. + * + * Copyright (c) 2003-2004, Keir Fraser & Steve Hand + * Modifications by Mark A. Williamson are (c) Intel Research Cambridge + */ + +#include "common.h" +#include <linux/blk.h> + +/* + * For convenience we distinguish between ide, scsi and 'other' (i.e. + * potentially combinations of the two) in the naming scheme and in a few + * other places (like default readahead, etc). + */ +#define XLIDE_MAJOR_NAME "hd" +#define XLSCSI_MAJOR_NAME "sd" +#define XLVBD_MAJOR_NAME "xvd" + +#define XLIDE_DEVS_PER_MAJOR 2 +#define XLSCSI_DEVS_PER_MAJOR 16 +#define XLVBD_DEVS_PER_MAJOR 16 + +#define XLIDE_PARTN_SHIFT 6 /* amount to shift minor to get 'real' minor */ +#define XLIDE_MAX_PART (1 << XLIDE_PARTN_SHIFT) /* minors per ide vbd */ + +#define XLSCSI_PARTN_SHIFT 4 /* amount to shift minor to get 'real' minor */ +#define XLSCSI_MAX_PART (1 << XLSCSI_PARTN_SHIFT) /* minors per scsi vbd */ + +#define XLVBD_PARTN_SHIFT 4 /* amount to shift minor to get 'real' minor */ +#define XLVBD_MAX_PART (1 << XLVBD_PARTN_SHIFT) /* minors per 'other' vbd */ + +/* The below are for the generic drivers/block/ll_rw_block.c code. */ +static int xlide_blksize_size[256]; +static int xlide_hardsect_size[256]; +static int xlide_max_sectors[256]; +static int xlscsi_blksize_size[256]; +static int xlscsi_hardsect_size[256]; +static int xlscsi_max_sectors[256]; +static int xlvbd_blksize_size[256]; +static int xlvbd_hardsect_size[256]; +static int xlvbd_max_sectors[256]; + +/* Information about our VBDs. 
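+ * These are obtained with a BLKIF_OP_PROBE request (xlvbd_get_vbd_info() + * below) whose single segment names a whole page into which the backend + * writes an array of vdisk_t records; the response status carries the + * record count.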
*/ +#define MAX_VBDS 64 +static int nr_vbds; +static vdisk_t *vbd_info; + +static struct block_device_operations xlvbd_block_fops = +{ + open: blkif_open, + release: blkif_release, + ioctl: blkif_ioctl, + check_media_change: blkif_check, + revalidate: blkif_revalidate, +}; + +static int xlvbd_get_vbd_info(vdisk_t *disk_info) +{ + vdisk_t *buf = (vdisk_t *)__get_free_page(GFP_KERNEL); + blkif_request_t req; + blkif_response_t rsp; + int nr; + + memset(&req, 0, sizeof(req)); + req.operation = BLKIF_OP_PROBE; + req.nr_segments = 1; + req.frame_and_sects[0] = virt_to_machine(buf) | 7; + + blkif_control_send(&req, &rsp); + + if ( rsp.status <= 0 ) + { + printk(KERN_ALERT "Could not probe disks (%d)\n", rsp.status); + return -1; + } + + if ( (nr = rsp.status) > MAX_VBDS ) + nr = MAX_VBDS; + memcpy(disk_info, buf, nr * sizeof(vdisk_t)); + + return nr; +} + +/* + * xlvbd_init_device - initialise a VBD device + * @disk: a vdisk_t describing the VBD + * + * Takes a vdisk_t * that describes a VBD the domain has access to. + * Performs appropriate initialisation and registration of the device. + * + * Care needs to be taken when making re-entrant calls to ensure that + * corruption does not occur. Also, devices that are in use should not have + * their details updated. This is the caller's responsibility. + */ +static int xlvbd_init_device(vdisk_t *xd) +{ + int device = xd->device; + int major = MAJOR(device); + int minor = MINOR(device); + int is_ide = IDE_DISK_MAJOR(major); /* is this an ide device? */ + int is_scsi= SCSI_BLK_MAJOR(major); /* is this a scsi device? */ + char *major_name; + struct gendisk *gd; + struct block_device *bd; + xl_disk_t *disk; + int i, rc = 0, max_part, partno; + unsigned long capacity; + + unsigned char buf[64]; + + if ( (bd = bdget(device)) == NULL ) + return -1; + + /* + * Update of partition info, and check of usage count, is protected + * by the per-block-device semaphore. 
+ */ + down(&bd->bd_sem); + + if ( ((disk = xldev_to_xldisk(device)) != NULL) && (disk->usage != 0) ) + { + printk(KERN_ALERT "VBD update failed - in use [dev=%x]\n", device); + rc = -1; + goto out; + } + + if ( is_ide ) { + + major_name = XLIDE_MAJOR_NAME; + max_part = XLIDE_MAX_PART; + + } else if ( is_scsi ) { + + major_name = XLSCSI_MAJOR_NAME; + max_part = XLSCSI_MAX_PART; + + } else if (XD_VIRTUAL(xd->info)) { + + major_name = XLVBD_MAJOR_NAME; + max_part = XLVBD_MAX_PART; + + } else { + + /* SMH: hmm - probably a CCISS driver or sim; assume CCISS for now */ + printk(KERN_ALERT "Assuming device %02x:%02x is CCISS/SCSI\n", + major, minor); + is_scsi = 1; + major_name = "cciss"; + max_part = XLSCSI_MAX_PART; + + } + + partno = minor & (max_part - 1); + + if ( (gd = get_gendisk(device)) == NULL ) + { + rc = register_blkdev(major, major_name, &xlvbd_block_fops); + if ( rc < 0 ) + { + printk(KERN_ALERT "XL VBD: can't get major %d\n", major); + goto out; + } + + if ( is_ide ) + { + blksize_size[major] = xlide_blksize_size; + hardsect_size[major] = xlide_hardsect_size; + max_sectors[major] = xlide_max_sectors; + read_ahead[major] = 8; /* from drivers/ide/ide-probe.c */ + } + else if ( is_scsi ) + { + blksize_size[major] = xlscsi_blksize_size; + hardsect_size[major] = xlscsi_hardsect_size; + max_sectors[major] = xlscsi_max_sectors; + read_ahead[major] = 0; /* XXX 8; -- guessing */ + } + else + { + blksize_size[major] = xlvbd_blksize_size; + hardsect_size[major] = xlvbd_hardsect_size; + max_sectors[major] = xlvbd_max_sectors; + read_ahead[major] = 8; + } + + blk_init_queue(BLK_DEFAULT_QUEUE(major), do_blkif_request); + + /* + * Turn off barking 'headactive' mode. We dequeue buffer heads as + * soon as we pass them to the back-end driver. + */ + blk_queue_headactive(BLK_DEFAULT_QUEUE(major), 0); + + /* Construct an appropriate gendisk structure. */ + gd = kmalloc(sizeof(struct gendisk), GFP_KERNEL); + gd->major = major; + gd->major_name = major_name; + + gd->max_p = max_part; + if ( is_ide ) + { + gd->minor_shift = XLIDE_PARTN_SHIFT; + gd->nr_real = XLIDE_DEVS_PER_MAJOR; + } + else if ( is_scsi ) + { + gd->minor_shift = XLSCSI_PARTN_SHIFT; + gd->nr_real = XLSCSI_DEVS_PER_MAJOR; + } + else + { + gd->minor_shift = XLVBD_PARTN_SHIFT; + gd->nr_real = XLVBD_DEVS_PER_MAJOR; + } + + /* + ** The sizes[] and part[] arrays hold the sizes and other + ** information about every partition with this 'major' (i.e. + ** every disk sharing the 8 bit prefix * max partns per disk) + */ + gd->sizes = kmalloc(max_part*gd->nr_real*sizeof(int), GFP_KERNEL); + gd->part = kmalloc(max_part*gd->nr_real*sizeof(struct hd_struct), + GFP_KERNEL); + memset(gd->sizes, 0, max_part * gd->nr_real * sizeof(int)); + memset(gd->part, 0, max_part * gd->nr_real + * sizeof(struct hd_struct)); + + + gd->real_devices = kmalloc(gd->nr_real * sizeof(xl_disk_t), + GFP_KERNEL); + memset(gd->real_devices, 0, gd->nr_real * sizeof(xl_disk_t)); + + gd->next = NULL; + gd->fops = &xlvbd_block_fops; + + gd->de_arr = kmalloc(gd->nr_real * sizeof(*gd->de_arr), + GFP_KERNEL); + gd->flags = kmalloc(gd->nr_real * sizeof(*gd->flags), GFP_KERNEL); + + memset(gd->de_arr, 0, gd->nr_real * sizeof(*gd->de_arr)); + memset(gd->flags, 0, gd->nr_real * sizeof(*gd->flags)); + + add_gendisk(gd); + + blk_size[major] = gd->sizes; + } + + if ( XD_READONLY(xd->info) ) + set_device_ro(device, 1); + + gd->flags[minor >> gd->minor_shift] |= GENHD_FL_XEN; + + /* NB. Linux 2.4 only handles 32-bit sector offsets and capacities. 
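+     * With 512-byte sectors that caps a unit at 2^32 * 512 bytes (2TB); +     * anything larger would be truncated by the cast below.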
*/ + capacity = (unsigned long)xd->capacity; + + if ( partno != 0 ) + { + /* + * If this was previously set up as a real disc we will have set + * up partition-table information. Virtual partitions override + * 'real' partitions, and the two cannot coexist on a device. + */ + if ( !(gd->flags[minor >> gd->minor_shift] & GENHD_FL_VIRT_PARTNS) && + (gd->sizes[minor & ~(max_part-1)] != 0) ) + { + /* + * Any non-zero sub-partition entries must be cleaned out before + * installing 'virtual' partition entries. The two types cannot + * coexist, and virtual partitions are favoured. + */ + kdev_t dev = device & ~(max_part-1); + for ( i = max_part - 1; i > 0; i-- ) + { + invalidate_device(dev+i, 1); + gd->part[MINOR(dev+i)].start_sect = 0; + gd->part[MINOR(dev+i)].nr_sects = 0; + gd->sizes[MINOR(dev+i)] = 0; + } + printk(KERN_ALERT + "Virtual partitions found for /dev/%s - ignoring any " + "real partition information we may have found.\n", + disk_name(gd, MINOR(device), buf)); + } + + /* Need to skankily setup 'partition' information */ + gd->part[minor].start_sect = 0; + gd->part[minor].nr_sects = capacity; + gd->sizes[minor] = capacity; + + gd->flags[minor >> gd->minor_shift] |= GENHD_FL_VIRT_PARTNS; + } + else + { + gd->part[minor].nr_sects = capacity; + gd->sizes[minor] = capacity>>(BLOCK_SIZE_BITS-9); + + /* Some final fix-ups depending on the device type */ + switch ( XD_TYPE(xd->info) ) + { + case XD_TYPE_CDROM: + case XD_TYPE_FLOPPY: + case XD_TYPE_TAPE: + gd->flags[minor >> gd->minor_shift] |= GENHD_FL_REMOVABLE; + printk(KERN_ALERT + "Skipping partition check on %s /dev/%s\n", + XD_TYPE(xd->info)==XD_TYPE_CDROM ? "cdrom" : + (XD_TYPE(xd->info)==XD_TYPE_TAPE ? "tape" : + "floppy"), disk_name(gd, MINOR(device), buf)); + break; + + case XD_TYPE_DISK: + /* Only check partitions on real discs (not virtual!). */ + if ( gd->flags[minor>>gd->minor_shift] & GENHD_FL_VIRT_PARTNS ) + { + printk(KERN_ALERT + "Skipping partition check on virtual /dev/%s\n", + disk_name(gd, MINOR(device), buf)); + break; + } + register_disk(gd, device, gd->max_p, &xlvbd_block_fops, capacity); + break; + + default: + printk(KERN_ALERT "XenoLinux: unknown device type %d\n", + XD_TYPE(xd->info)); + break; + } + } + + out: + up(&bd->bd_sem); + bdput(bd); + return rc; +} + + +/* + * xlvbd_remove_device - remove a device node if possible + * @device: numeric device ID + * + * Updates the gendisk structure and invalidates devices. + * + * This is OK for now but in future, should perhaps consider where this should + * deallocate gendisks / unregister devices. + */ +static int xlvbd_remove_device(int device) +{ + int i, rc = 0, minor = MINOR(device); + struct gendisk *gd; + struct block_device *bd; + xl_disk_t *disk = NULL; + + if ( (bd = bdget(device)) == NULL ) + return -1; + + /* + * Update of partition info, and check of usage count, is protected + * by the per-block-device semaphore. + */ + down(&bd->bd_sem); + + if ( ((gd = get_gendisk(device)) == NULL) || + ((disk = xldev_to_xldisk(device)) == NULL) ) + BUG(); + + if ( disk->usage != 0 ) + { + printk(KERN_ALERT "VBD removal failed - in use [dev=%x]\n", device); + rc = -1; + goto out; + } + + if ( (minor & (gd->max_p-1)) != 0 ) + { + /* 1: The VBD is mapped to a partition rather than a whole unit. */ + invalidate_device(device, 1); + gd->part[minor].start_sect = 0; + gd->part[minor].nr_sects = 0; + gd->sizes[minor] = 0; + + /* Clear the consists-of-virtual-partitions flag if possible. 
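+         * The flag is cleared first and then re-set if any sibling minor +         * still has a non-zero size, so it ends up reflecting whether any +         * virtual partition remains on this unit.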
*/ + gd->flags[minor >> gd->minor_shift] &= ~GENHD_FL_VIRT_PARTNS; + for ( i = 1; i < gd->max_p; i++ ) + if ( gd->sizes[(minor & ~(gd->max_p-1)) + i] != 0 ) + gd->flags[minor >> gd->minor_shift] |= GENHD_FL_VIRT_PARTNS; + + /* + * If all virtual partitions are now gone, and a 'whole unit' VBD is + * present, then we can try to grok the unit's real partition table. + */ + if ( !(gd->flags[minor >> gd->minor_shift] & GENHD_FL_VIRT_PARTNS) && + (gd->sizes[minor & ~(gd->max_p-1)] != 0) && + !(gd->flags[minor >> gd->minor_shift] & GENHD_FL_REMOVABLE) ) + { + register_disk(gd, + device&~(gd->max_p-1), + gd->max_p, + &xlvbd_block_fops, + gd->part[minor&~(gd->max_p-1)].nr_sects); + } + } + else + { + /* + * 2: The VBD is mapped to an entire 'unit'. Clear all partitions. + * NB. The partition entries are only cleared if there are no VBDs + * mapped to individual partitions on this unit. + */ + i = gd->max_p - 1; /* Default: clear subpartitions as well. */ + if ( gd->flags[minor >> gd->minor_shift] & GENHD_FL_VIRT_PARTNS ) + i = 0; /* 'Virtual' mode: only clear the 'whole unit' entry. */ + while ( i >= 0 ) + { + invalidate_device(device+i, 1); + gd->part[minor+i].start_sect = 0; + gd->part[minor+i].nr_sects = 0; + gd->sizes[minor+i] = 0; + i--; + } + } + + out: + up(&bd->bd_sem); + bdput(bd); + return rc; +} + +/* + * xlvbd_update_vbds - reprobes the VBD status and performs updates to driver + * state. The VBDs need to be updated in this way when the domain is + * initialised and also each time we receive an XLBLK_UPDATE event. + */ +void xlvbd_update_vbds(void) +{ + int i, j, k, old_nr, new_nr; + vdisk_t *old_info, *new_info, *merged_info; + + old_info = vbd_info; + old_nr = nr_vbds; + + new_info = kmalloc(MAX_VBDS * sizeof(vdisk_t), GFP_KERNEL); + if ( unlikely((new_nr = xlvbd_get_vbd_info(new_info)) < 0) ) + { + kfree(new_info); + return; + } + + /* + * Final list maximum size is old list + new list. This occurs only when + * old list and new list do not overlap at all, and we cannot yet destroy + * VBDs in the old list because the usage counts are busy. + */ + merged_info = kmalloc((old_nr + new_nr) * sizeof(vdisk_t), GFP_KERNEL); + + /* @i tracks old list; @j tracks new list; @k tracks merged list. */ + i = j = k = 0; + + while ( (i < old_nr) && (j < new_nr) ) + { + if ( old_info[i].device < new_info[j].device ) + { + if ( xlvbd_remove_device(old_info[i].device) != 0 ) + memcpy(&merged_info[k++], &old_info[i], sizeof(vdisk_t)); + i++; + } + else if ( old_info[i].device > new_info[j].device ) + { + if ( xlvbd_init_device(&new_info[j]) == 0 ) + memcpy(&merged_info[k++], &new_info[j], sizeof(vdisk_t)); + j++; + } + else + { + if ( ((old_info[i].capacity == new_info[j].capacity) && + (old_info[i].info == new_info[j].info)) || + (xlvbd_remove_device(old_info[i].device) != 0) ) + memcpy(&merged_info[k++], &old_info[i], sizeof(vdisk_t)); + else if ( xlvbd_init_device(&new_info[j]) == 0 ) + memcpy(&merged_info[k++], &new_info[j], sizeof(vdisk_t)); + i++; j++; + } + } + + for ( ; i < old_nr; i++ ) + { + if ( xlvbd_remove_device(old_info[i].device) != 0 ) + memcpy(&merged_info[k++], &old_info[i], sizeof(vdisk_t)); + } + + for ( ; j < new_nr; j++ ) + { + if ( xlvbd_init_device(&new_info[j]) == 0 ) + memcpy(&merged_info[k++], &new_info[j], sizeof(vdisk_t)); + } + + vbd_info = merged_info; + nr_vbds = k; + + kfree(old_info); + kfree(new_info); +} + + +/* + * Set up all the linux device goop for the virtual block devices (vbd's) that + * we know about. Note that although from the backend driver's p.o.v. 
VBDs are + * addressed simply as an opaque 16-bit device number, the domain creation tools + * conventionally allocate these numbers to correspond to those used by 'real' + * linux -- this is just for convenience as it means e.g. that the same + * /etc/fstab can be used when booting with or without Xen. + */ +int xlvbd_init(void) +{ + int i; + + /* + * If compiled as a module, we don't support unloading yet. We therefore + * permanently increment the reference count to disallow it. + */ + SET_MODULE_OWNER(&xlvbd_block_fops); + MOD_INC_USE_COUNT; + + /* Initialize the global arrays. */ + for ( i = 0; i < 256; i++ ) + { + /* from the generic ide code (drivers/ide/ide-probe.c, etc) */ + xlide_blksize_size[i] = 1024; + xlide_hardsect_size[i] = 512; + xlide_max_sectors[i] = 128; /* 'hwif->rqsize' if we knew it */ + + /* from the generic scsi disk code (drivers/scsi/sd.c) */ + xlscsi_blksize_size[i] = 1024; /* XXX 512; */ + xlscsi_hardsect_size[i] = 512; + xlscsi_max_sectors[i] = 128*8; /* XXX 128; */ + + /* we don't really know what to set these to since it depends */ + xlvbd_blksize_size[i] = 512; + xlvbd_hardsect_size[i] = 512; + xlvbd_max_sectors[i] = 128; + } + + vbd_info = kmalloc(MAX_VBDS * sizeof(vdisk_t), GFP_KERNEL); + nr_vbds = xlvbd_get_vbd_info(vbd_info); + + if ( nr_vbds < 0 ) + { + kfree(vbd_info); + vbd_info = NULL; + nr_vbds = 0; + } + else + { + for ( i = 0; i < nr_vbds; i++ ) + xlvbd_init_device(&vbd_info[i]); + } + + return 0; +} diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/block/Makefile b/linux-2.4.26-xen-sparse/arch/xen/drivers/block/Makefile new file mode 100644 index 0000000000..35986ca54a --- /dev/null +++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/block/Makefile @@ -0,0 +1,3 @@ +O_TARGET := drv.o +obj-y := block.o vbd.o +include $(TOPDIR)/Rules.make diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/block/block.c b/linux-2.4.26-xen-sparse/arch/xen/drivers/block/block.c new file mode 100644 index 0000000000..43a6a23479 --- /dev/null +++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/block/block.c @@ -0,0 +1,625 @@ +/****************************************************************************** + * block.c + * + * Xenolinux virtual block-device driver. + * + * Copyright (c) 2003-2004, Keir Fraser & Steve Hand + * Modifications by Mark A. Williamson are (c) Intel Research Cambridge + */ + +#include "block.h" +#include <linux/blk.h> +#include <linux/cdrom.h> +#include <linux/tqueue.h> +#include <linux/sched.h> +#include <scsi/scsi.h> + +#include <linux/interrupt.h> + +typedef unsigned char byte; /* from linux/ide.h */ + +#define STATE_ACTIVE 0 +#define STATE_SUSPENDED 1 +#define STATE_CLOSED 2 +static unsigned int state = STATE_SUSPENDED; + +/* Dynamically-mapped IRQs. */ +static int xlblk_response_irq, xlblk_update_irq; + +static blk_ring_t *blk_ring; +static BLK_RING_IDX resp_cons; /* Response consumer for comms ring. */ +static BLK_RING_IDX req_prod; /* Private request producer. */ + +/* We plug the I/O ring if the driver is suspended or if the ring is full. */ +#define RING_PLUGGED (((req_prod - resp_cons) == BLK_RING_SIZE) || \ + (state != STATE_ACTIVE)) + + +/* + * Request queues with outstanding work, but ring is currently full. + * We need no special lock here, as we always access this with the + * io_request_lock held. We only need a small maximum list. 
+ */ +#define MAX_PENDING 8 +static request_queue_t *pending_queues[MAX_PENDING]; +static int nr_pending; + +static kdev_t sg_dev; +static int sg_operation = -1; +static unsigned long sg_next_sect; +#define DISABLE_SCATTERGATHER() (sg_operation = -1) + +static inline void signal_requests_to_xen(void) +{ + block_io_op_t op; + + DISABLE_SCATTERGATHER(); + blk_ring->req_prod = req_prod; + + op.cmd = BLOCK_IO_OP_SIGNAL; + HYPERVISOR_block_io_op(&op); + return; +} + + +/* + * xlblk_update_int/update_vbds_task - handle VBD update events from Xen + * + * Schedule a task for keventd to run, which will update the VBDs and perform + * the corresponding updates to our view of VBD state, so that XenoLinux will + * respond to changes / additions / deletions to the set of VBDs automatically. + */ +static struct tq_struct update_tq; +static void update_vbds_task(void *unused) +{ + xlvbd_update_vbds(); +} +static void xlblk_update_int(int irq, void *dev_id, struct pt_regs *ptregs) +{ + update_tq.routine = update_vbds_task; + schedule_task(&update_tq); +} + + +int xen_block_open(struct inode *inode, struct file *filep) +{ + short xldev = inode->i_rdev; + struct gendisk *gd = get_gendisk(xldev); + xl_disk_t *disk = xldev_to_xldisk(inode->i_rdev); + short minor = MINOR(xldev); + + if ( gd->part[minor].nr_sects == 0 ) + { + /* + * Device either doesn't exist, or has zero capacity; we use a few + * cheesy heuristics to return the relevant error code + */ + if ( (gd->sizes[minor >> gd->minor_shift] != 0) || + ((minor & (gd->max_p - 1)) != 0) ) + { + /* + * We have a real device, but no such partition, or we just have a + * partition number so guess this is the problem. + */ + return -ENXIO; /* no such device or address */ + } + else if ( gd->flags[minor >> gd->minor_shift] & GENHD_FL_REMOVABLE ) + { + /* This is a removable device => assume that media is missing. */ + return -ENOMEDIUM; /* media not present (this is a guess) */ + } + else + { + /* Just go for the general 'no such device' error. */ + return -ENODEV; /* no such device */ + } + } + + /* Update of usage count is protected by per-device semaphore. */ + disk->usage++; + + return 0; +} + + +int xen_block_release(struct inode *inode, struct file *filep) +{ + xl_disk_t *disk = xldev_to_xldisk(inode->i_rdev); + + /* + * When usage drops to zero it may allow more VBD updates to occur. + * Update of usage count is protected by a per-device semaphore. + */ + if ( --disk->usage == 0 ) + { + update_tq.routine = update_vbds_task; + schedule_task(&update_tq); + } + + return 0; +} + + +int xen_block_ioctl(struct inode *inode, struct file *filep, + unsigned command, unsigned long argument) +{ + kdev_t dev = inode->i_rdev; + struct hd_geometry *geo = (struct hd_geometry *)argument; + struct gendisk *gd; + struct hd_struct *part; + int i; + + /* NB. No need to check permissions. That is done for us. 
*/ + + DPRINTK_IOCTL("command: 0x%x, argument: 0x%lx, dev: 0x%04x\n", + command, (long) argument, dev); + + gd = get_gendisk(dev); + part = &gd->part[MINOR(dev)]; + + switch ( command ) + { + case BLKGETSIZE: + DPRINTK_IOCTL(" BLKGETSIZE: %x %lx\n", BLKGETSIZE, part->nr_sects); + return put_user(part->nr_sects, (unsigned long *) argument); + + case BLKGETSIZE64: + DPRINTK_IOCTL(" BLKGETSIZE64: %x %llx\n", BLKGETSIZE64, + (u64)part->nr_sects * 512); + return put_user((u64)part->nr_sects * 512, (u64 *) argument); + + case BLKRRPART: /* re-read partition table */ + DPRINTK_IOCTL(" BLKRRPART: %x\n", BLKRRPART); + return xen_block_revalidate(dev); + + case BLKSSZGET: + return hardsect_size[MAJOR(dev)][MINOR(dev)]; + + case BLKBSZGET: /* get block size */ + DPRINTK_IOCTL(" BLKBSZGET: %x\n", BLKBSZGET); + break; + + case BLKBSZSET: /* set block size */ + DPRINTK_IOCTL(" BLKBSZSET: %x\n", BLKBSZSET); + break; + + case BLKRASET: /* set read-ahead */ + DPRINTK_IOCTL(" BLKRASET: %x\n", BLKRASET); + break; + + case BLKRAGET: /* get read-ahead */ + DPRINTK_IOCTL(" BLKRAGET: %x\n", BLKRAGET); + break; + + case HDIO_GETGEO: + /* note: these values are complete garbage */ + DPRINTK_IOCTL(" HDIO_GETGEO: %x\n", HDIO_GETGEO); + if (!argument) return -EINVAL; + if (put_user(0x00, (unsigned long *) &geo->start)) return -EFAULT; + if (put_user(0xff, (byte *)&geo->heads)) return -EFAULT; + if (put_user(0x3f, (byte *)&geo->sectors)) return -EFAULT; + if (put_user(0x106, (unsigned short *)&geo->cylinders)) return -EFAULT; + return 0; + + case HDIO_GETGEO_BIG: + /* note: these values are complete garbage */ + DPRINTK_IOCTL(" HDIO_GETGEO_BIG: %x\n", HDIO_GETGEO_BIG); + if (!argument) return -EINVAL; + if (put_user(0x00, (unsigned long *) &geo->start)) return -EFAULT; + if (put_user(0xff, (byte *)&geo->heads)) return -EFAULT; + if (put_user(0x3f, (byte *)&geo->sectors)) return -EFAULT; + if (put_user(0x106, (unsigned int *) &geo->cylinders)) return -EFAULT; + return 0; + + case CDROMMULTISESSION: + DPRINTK("FIXME: support multisession CDs later\n"); + for ( i = 0; i < sizeof(struct cdrom_multisession); i++ ) + if ( put_user(0, (byte *)(argument + i)) ) return -EFAULT; + return 0; + + case SCSI_IOCTL_GET_BUS_NUMBER: + DPRINTK("FIXME: SCSI_IOCTL_GET_BUS_NUMBER ioctl in Xen blkdev"); + return -ENOSYS; + + default: + printk(KERN_ALERT "ioctl %08x not supported by Xen blkdev\n", command); + return -ENOSYS; + } + + return 0; +} + +/* check media change: should probably do something here in some cases :-) */ +int xen_block_check(kdev_t dev) +{ + DPRINTK("xen_block_check\n"); + return 0; +} + +int xen_block_revalidate(kdev_t dev) +{ + struct block_device *bd; + struct gendisk *gd; + xl_disk_t *disk; + unsigned long capacity; + int i, rc = 0; + + if ( (bd = bdget(dev)) == NULL ) + return -EINVAL; + + /* + * Update of partition info, and check of usage count, is protected + * by the per-block-device semaphore. + */ + down(&bd->bd_sem); + + if ( ((gd = get_gendisk(dev)) == NULL) || + ((disk = xldev_to_xldisk(dev)) == NULL) || + ((capacity = gd->part[MINOR(dev)].nr_sects) == 0) ) + { + rc = -EINVAL; + goto out; + } + + if ( disk->usage > 1 ) + { + rc = -EBUSY; + goto out; + } + + /* Only reread partition table if VBDs aren't mapped to partitions. 
*/ + if ( !(gd->flags[MINOR(dev) >> gd->minor_shift] & GENHD_FL_VIRT_PARTNS) ) + { + for ( i = gd->max_p - 1; i >= 0; i-- ) + { + invalidate_device(dev+i, 1); + gd->part[MINOR(dev+i)].start_sect = 0; + gd->part[MINOR(dev+i)].nr_sects = 0; + gd->sizes[MINOR(dev+i)] = 0; + } + + grok_partitions(gd, MINOR(dev)>>gd->minor_shift, gd->max_p, capacity); + } + + out: + up(&bd->bd_sem); + bdput(bd); + return rc; +} + + +/* + * hypervisor_request + * + * request block io + * + * id: for guest use only. + * operation: XEN_BLOCK_{READ,WRITE,PROBE,VBD*} + * buffer: buffer to read/write into. this should be a + * virtual address in the guest os. + */ +static int hypervisor_request(unsigned long id, + int operation, + char * buffer, + unsigned long sector_number, + unsigned short nr_sectors, + kdev_t device) +{ + unsigned long buffer_ma = phys_to_machine(virt_to_phys(buffer)); + struct gendisk *gd; + blk_ring_req_entry_t *req; + struct buffer_head *bh; + + if ( unlikely(nr_sectors >= (1<<9)) ) + BUG(); + if ( unlikely((buffer_ma & ((1<<9)-1)) != 0) ) + BUG(); + + if ( unlikely(state == STATE_CLOSED) ) + return 1; + + switch ( operation ) + { + + case XEN_BLOCK_READ: + case XEN_BLOCK_WRITE: + gd = get_gendisk(device); + + /* + * Update the sector_number we'll pass down as appropriate; note that + * we could sanity check that resulting sector will be in this + * partition, but this will happen in xen anyhow. + */ + sector_number += gd->part[MINOR(device)].start_sect; + + /* + * If this unit doesn't consist of virtual (i.e., Xen-specified) + * partitions then we clear the partn bits from the device number. + */ + if ( !(gd->flags[MINOR(device)>>gd->minor_shift] & + GENHD_FL_VIRT_PARTNS) ) + device &= ~(gd->max_p - 1); + + if ( (sg_operation == operation) && + (sg_dev == device) && + (sg_next_sect == sector_number) ) + { + req = &blk_ring->ring[MASK_BLK_IDX(req_prod-1)].req; + bh = (struct buffer_head *)id; + bh->b_reqnext = (struct buffer_head *)req->id; + req->id = id; + req->buffer_and_sects[req->nr_segments] = buffer_ma | nr_sectors; + if ( ++req->nr_segments < MAX_BLK_SEGS ) + sg_next_sect += nr_sectors; + else + DISABLE_SCATTERGATHER(); + return 0; + } + else if ( RING_PLUGGED ) + { + return 1; + } + else + { + sg_operation = operation; + sg_dev = device; + sg_next_sect = sector_number + nr_sectors; + } + break; + + default: + panic("unknown op %d\n", operation); + } + + /* Fill out a communications ring structure. 
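+     * For illustration: each buffer_and_sects[] entry packs the machine +     * address of the sector-aligned buffer with a sector count in the low +     * nine bits (hence the nr_sectors >= (1<<9) check above). E.g. eight +     * sectors at machine address 0xabc000 encode as 0xabc008.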
*/ + req = &blk_ring->ring[MASK_BLK_IDX(req_prod)].req; + req->id = id; + req->operation = operation; + req->sector_number = (xen_sector_t)sector_number; + req->device = device; + req->nr_segments = 1; + req->buffer_and_sects[0] = buffer_ma | nr_sectors; + req_prod++; + + return 0; +} + + +/* + * do_xlblk_request + * read a block; request is in a request queue + */ +void do_xlblk_request(request_queue_t *rq) +{ + struct request *req; + struct buffer_head *bh, *next_bh; + int rw, nsect, full, queued = 0; + + DPRINTK("xlblk.c::do_xlblk_request\n"); + + while ( !rq->plugged && !list_empty(&rq->queue_head)) + { + if ( (req = blkdev_entry_next_request(&rq->queue_head)) == NULL ) + goto out; + + DPRINTK("do_xlblk_request %p: cmd %i, sec %lx, (%li/%li) bh:%p\n", + req, req->cmd, req->sector, + req->current_nr_sectors, req->nr_sectors, req->bh); + + rw = req->cmd; + if ( rw == READA ) + rw = READ; + if ( unlikely((rw != READ) && (rw != WRITE)) ) + panic("XenoLinux Virtual Block Device: bad cmd: %d\n", rw); + + req->errors = 0; + + bh = req->bh; + while ( bh != NULL ) + { + next_bh = bh->b_reqnext; + bh->b_reqnext = NULL; + + full = hypervisor_request( + (unsigned long)bh, + (rw == READ) ? XEN_BLOCK_READ : XEN_BLOCK_WRITE, + bh->b_data, bh->b_rsector, bh->b_size>>9, bh->b_rdev); + + if ( full ) + { + bh->b_reqnext = next_bh; + pending_queues[nr_pending++] = rq; + if ( unlikely(nr_pending >= MAX_PENDING) ) + BUG(); + goto out; + } + + queued++; + + /* Dequeue the buffer head from the request. */ + nsect = bh->b_size >> 9; + bh = req->bh = next_bh; + + if ( bh != NULL ) + { + /* There's another buffer head to do. Update the request. */ + req->hard_sector += nsect; + req->hard_nr_sectors -= nsect; + req->sector = req->hard_sector; + req->nr_sectors = req->hard_nr_sectors; + req->current_nr_sectors = bh->b_size >> 9; + req->buffer = bh->b_data; + } + else + { + /* That was the last buffer head. Finalise the request. */ + if ( unlikely(end_that_request_first(req, 1, "XenBlk")) ) + BUG(); + blkdev_dequeue_request(req); + end_that_request_last(req); + } + } + } + + out: + if ( queued != 0 ) signal_requests_to_xen(); +} + + +static void kick_pending_request_queues(void) +{ + /* We kick pending request queues if the ring is reasonably empty. */ + if ( (nr_pending != 0) && + ((req_prod - resp_cons) < (BLK_RING_SIZE >> 1)) ) + { + /* Attempt to drain the queue, but bail if the ring becomes full. 
*/ + while ( (nr_pending != 0) && !RING_PLUGGED ) + do_xlblk_request(pending_queues[--nr_pending]); + } +} + + +static void xlblk_response_int(int irq, void *dev_id, struct pt_regs *ptregs) +{ + BLK_RING_IDX i; + unsigned long flags; + struct buffer_head *bh, *next_bh; + + if ( unlikely(state == STATE_CLOSED) ) + return; + + spin_lock_irqsave(&io_request_lock, flags); + + for ( i = resp_cons; i != blk_ring->resp_prod; i++ ) + { + blk_ring_resp_entry_t *bret = &blk_ring->ring[MASK_BLK_IDX(i)].resp; + switch ( bret->operation ) + { + case XEN_BLOCK_READ: + case XEN_BLOCK_WRITE: + if ( unlikely(bret->status != 0) ) + DPRINTK("Bad return from blkdev data request: %lx\n", + bret->status); + for ( bh = (struct buffer_head *)bret->id; + bh != NULL; + bh = next_bh ) + { + next_bh = bh->b_reqnext; + bh->b_reqnext = NULL; + bh->b_end_io(bh, !bret->status); + } + break; + + default: + BUG(); + } + } + + resp_cons = i; + + kick_pending_request_queues(); + + spin_unlock_irqrestore(&io_request_lock, flags); +} + + +static void reset_xlblk_interface(void) +{ + block_io_op_t op; + + op.cmd = BLOCK_IO_OP_RESET; + if ( HYPERVISOR_block_io_op(&op) != 0 ) + printk(KERN_ALERT "Possible blkdev trouble: couldn't reset ring\n"); + + op.cmd = BLOCK_IO_OP_RING_ADDRESS; + (void)HYPERVISOR_block_io_op(&op); + + set_fixmap(FIX_BLKRING_BASE, op.u.ring_mfn << PAGE_SHIFT); + blk_ring = (blk_ring_t *)fix_to_virt(FIX_BLKRING_BASE); + blk_ring->req_prod = blk_ring->resp_prod = resp_cons = req_prod = 0; + + wmb(); + state = STATE_ACTIVE; +} + + +int __init xlblk_init(void) +{ + int error; + + nr_pending = 0; + + reset_xlblk_interface(); + + xlblk_response_irq = bind_virq_to_irq(VIRQ_BLKDEV); + xlblk_update_irq = bind_virq_to_irq(VIRQ_VBD_UPD); + + error = request_irq(xlblk_response_irq, xlblk_response_int, + SA_SAMPLE_RANDOM, "blkdev", NULL); + if ( error ) + { + printk(KERN_ALERT "Could not allocate receive interrupt\n"); + goto fail; + } + + error = request_irq(xlblk_update_irq, xlblk_update_int, + 0, "blkdev", NULL); + + if ( error ) + { + printk(KERN_ALERT "Could not allocate block update interrupt\n"); + goto fail; + } + + (void)xlvbd_init(); + + return 0; + + fail: + return error; +} + + +static void __exit xlblk_cleanup(void) +{ + xlvbd_cleanup(); + free_irq(xlblk_response_irq, NULL); + free_irq(xlblk_update_irq, NULL); + unbind_virq_from_irq(VIRQ_BLKDEV); + unbind_virq_from_irq(VIRQ_VBD_UPD); +} + + +#ifdef MODULE +module_init(xlblk_init); +module_exit(xlblk_cleanup); +#endif + + +void blkdev_suspend(void) +{ + state = STATE_SUSPENDED; + wmb(); + + while ( resp_cons != blk_ring->req_prod ) + { + barrier(); + current->state = TASK_INTERRUPTIBLE; + schedule_timeout(1); + } + + wmb(); + state = STATE_CLOSED; + wmb(); + + clear_fixmap(FIX_BLKRING_BASE); +} + + +void blkdev_resume(void) +{ + reset_xlblk_interface(); + spin_lock_irq(&io_request_lock); + kick_pending_request_queues(); + spin_unlock_irq(&io_request_lock); +} diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/block/block.h b/linux-2.4.26-xen-sparse/arch/xen/drivers/block/block.h new file mode 100644 index 0000000000..e41e03970e --- /dev/null +++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/block/block.h @@ -0,0 +1,82 @@ +/****************************************************************************** + * block.h + * + * Shared definitions between all levels of XenoLinux Virtual block devices. 
+ */ + +#ifndef __XEN_DRIVERS_BLOCK_H__ +#define __XEN_DRIVERS_BLOCK_H__ + +#include <linux/config.h> +#include <linux/module.h> + +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/errno.h> + +#include <linux/fs.h> +#include <linux/hdreg.h> +#include <linux/blkdev.h> +#include <linux/major.h> + +#include <asm/hypervisor-ifs/hypervisor-if.h> +#include <asm/hypervisor-ifs/vbd.h> +#include <asm/io.h> +#include <asm/atomic.h> +#include <asm/uaccess.h> + +#if 0 +#define DPRINTK(_f, _a...) printk ( KERN_ALERT _f , ## _a ) +#else +#define DPRINTK(_f, _a...) ((void)0) +#endif + +#if 0 +#define DPRINTK_IOCTL(_f, _a...) printk ( KERN_ALERT _f , ## _a ) +#else +#define DPRINTK_IOCTL(_f, _a...) ((void)0) +#endif + +/* Private gendisk->flags[] values. */ +#define GENHD_FL_XEN 2 /* Is unit a Xen block device? */ +#define GENHD_FL_VIRT_PARTNS 4 /* Are unit partitions virtual? */ + +/* + * We have one of these per vbd, whether ide, scsi or 'other'. + * They hang in an array off the gendisk structure. We may end up putting + * all kinds of interesting stuff here :-) + */ +typedef struct xl_disk { + int usage; +} xl_disk_t; + +extern int xen_control_msg(int operation, char *buffer, int size); +extern int xen_block_open(struct inode *inode, struct file *filep); +extern int xen_block_release(struct inode *inode, struct file *filep); +extern int xen_block_ioctl(struct inode *inode, struct file *filep, + unsigned command, unsigned long argument); +extern int xen_block_check(kdev_t dev); +extern int xen_block_revalidate(kdev_t dev); +extern void do_xlblk_request (request_queue_t *rq); + +extern void xlvbd_update_vbds(void); + +static inline xl_disk_t *xldev_to_xldisk(kdev_t xldev) +{ + struct gendisk *gd = get_gendisk(xldev); + + if ( gd == NULL ) + return NULL; + + return (xl_disk_t *)gd->real_devices + + (MINOR(xldev) >> gd->minor_shift); +} + + +/* Virtual block-device subsystem. */ +extern int xlvbd_init(void); +extern void xlvbd_cleanup(void); + +#endif /* __XEN_DRIVERS_BLOCK_H__ */ diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/block/vbd.c b/linux-2.4.26-xen-sparse/arch/xen/drivers/block/vbd.c new file mode 100644 index 0000000000..e08b976c56 --- /dev/null +++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/block/vbd.c @@ -0,0 +1,561 @@ +/****************************************************************************** + * vbd.c + * + * Xenolinux virtual block-device driver (xvd). + * + * Copyright (c) 2003-2004, Keir Fraser & Steve Hand + * Modifications by Mark A. Williamson are (c) Intel Research Cambridge + */ + +#include "block.h" +#include <linux/blk.h> + +/* + * For convenience we distinguish between ide, scsi and 'other' (i.e. + * potentially combinations of the two) in the naming scheme and in a few + * other places (like default readahead, etc). 
+ */ +#define XLIDE_MAJOR_NAME "hd" +#define XLSCSI_MAJOR_NAME "sd" +#define XLVBD_MAJOR_NAME "xvd" + +#define XLIDE_DEVS_PER_MAJOR 2 +#define XLSCSI_DEVS_PER_MAJOR 16 +#define XLVBD_DEVS_PER_MAJOR 16 + +#define XLIDE_PARTN_SHIFT 6 /* amount to shift minor to get 'real' minor */ +#define XLIDE_MAX_PART (1 << XLIDE_PARTN_SHIFT) /* minors per ide vbd */ + +#define XLSCSI_PARTN_SHIFT 4 /* amount to shift minor to get 'real' minor */ +#define XLSCSI_MAX_PART (1 << XLSCSI_PARTN_SHIFT) /* minors per scsi vbd */ + +#define XLVBD_PARTN_SHIFT 4 /* amount to shift minor to get 'real' minor */ +#define XLVBD_MAX_PART (1 << XLVBD_PARTN_SHIFT) /* minors per 'other' vbd */ + +/* The below are for the generic drivers/block/ll_rw_block.c code. */ +static int xlide_blksize_size[256]; +static int xlide_hardsect_size[256]; +static int xlide_max_sectors[256]; +static int xlscsi_blksize_size[256]; +static int xlscsi_hardsect_size[256]; +static int xlscsi_max_sectors[256]; +static int xlvbd_blksize_size[256]; +static int xlvbd_hardsect_size[256]; +static int xlvbd_max_sectors[256]; + +/* Information from Xen about our VBDs. */ +#define MAX_VBDS 64 +static int nr_vbds; +static xen_disk_t *vbd_info; + +static struct block_device_operations xlvbd_block_fops = +{ + open: xen_block_open, + release: xen_block_release, + ioctl: xen_block_ioctl, + check_media_change: xen_block_check, + revalidate: xen_block_revalidate, +}; + +static int xlvbd_get_vbd_info(xen_disk_t *disk_info) +{ + int error; + block_io_op_t op; + + /* Probe for disk information. */ + memset(&op, 0, sizeof(op)); + op.cmd = BLOCK_IO_OP_VBD_PROBE; + op.u.probe_params.domain = 0; + op.u.probe_params.xdi.max = MAX_VBDS; + op.u.probe_params.xdi.disks = disk_info; + op.u.probe_params.xdi.count = 0; + + if ( (error = HYPERVISOR_block_io_op(&op)) != 0 ) + { + printk(KERN_ALERT "Could not probe disks (%d)\n", error); + return -1; + } + + return op.u.probe_params.xdi.count; +} + +/* + * xlvbd_init_device - initialise a VBD device + * @disk: a xen_disk_t describing the VBD + * + * Takes a xen_disk_t * that describes a VBD the domain has access to. + * Performs appropriate initialisation and registration of the device. + * + * Care needs to be taken when making re-entrant calls to ensure that + * corruption does not occur. Also, devices that are in use should not have + * their details updated. This is the caller's responsibility. + */ +static int xlvbd_init_device(xen_disk_t *xd) +{ + int device = xd->device; + int major = MAJOR(device); + int minor = MINOR(device); + int is_ide = IDE_DISK_MAJOR(major); /* is this an ide device? */ + int is_scsi= SCSI_BLK_MAJOR(major); /* is this a scsi device? */ + char *major_name; + struct gendisk *gd; + struct block_device *bd; + xl_disk_t *disk; + int i, rc = 0, max_part, partno; + unsigned long capacity; + + unsigned char buf[64]; + + if ( (bd = bdget(device)) == NULL ) + return -1; + + /* + * Update of partition info, and check of usage count, is protected + * by the per-block-device semaphore. 
+ */ + down(&bd->bd_sem); + + if ( ((disk = xldev_to_xldisk(device)) != NULL) && (disk->usage != 0) ) + { + printk(KERN_ALERT "VBD update failed - in use [dev=%x]\n", device); + rc = -1; + goto out; + } + + if ( is_ide ) { + + major_name = XLIDE_MAJOR_NAME; + max_part = XLIDE_MAX_PART; + + } else if ( is_scsi ) { + + major_name = XLSCSI_MAJOR_NAME; + max_part = XLSCSI_MAX_PART; + + } else if (XD_VIRTUAL(xd->info)) { + + major_name = XLVBD_MAJOR_NAME; + max_part = XLVBD_MAX_PART; + + } else { + + /* SMH: hmm - probably a CCISS driver or sim; assume CCISS for now */ + printk(KERN_ALERT "Assuming device %02x:%02x is CCISS/SCSI\n", + major, minor); + is_scsi = 1; + major_name = "cciss"; + max_part = XLSCSI_MAX_PART; + + } + + partno = minor & (max_part - 1); + + if ( (gd = get_gendisk(device)) == NULL ) + { + rc = register_blkdev(major, major_name, &xlvbd_block_fops); + if ( rc < 0 ) + { + printk(KERN_ALERT "XL VBD: can't get major %d\n", major); + goto out; + } + + if ( is_ide ) + { + blksize_size[major] = xlide_blksize_size; + hardsect_size[major] = xlide_hardsect_size; + max_sectors[major] = xlide_max_sectors; + read_ahead[major] = 8; /* from drivers/ide/ide-probe.c */ + } + else if ( is_scsi ) + { + blksize_size[major] = xlscsi_blksize_size; + hardsect_size[major] = xlscsi_hardsect_size; + max_sectors[major] = xlscsi_max_sectors; + read_ahead[major] = 0; /* XXX 8; -- guessing */ + } + else + { + blksize_size[major] = xlvbd_blksize_size; + hardsect_size[major] = xlvbd_hardsect_size; + max_sectors[major] = xlvbd_max_sectors; + read_ahead[major] = 8; + } + + blk_init_queue(BLK_DEFAULT_QUEUE(major), do_xlblk_request); + + /* + * Turn off barking 'headactive' mode. We dequeue buffer heads as + * soon as we pass them down to Xen. + */ + blk_queue_headactive(BLK_DEFAULT_QUEUE(major), 0); + + /* Construct an appropriate gendisk structure. */ + gd = kmalloc(sizeof(struct gendisk), GFP_KERNEL); + gd->major = major; + gd->major_name = major_name; + + gd->max_p = max_part; + if ( is_ide ) + { + gd->minor_shift = XLIDE_PARTN_SHIFT; + gd->nr_real = XLIDE_DEVS_PER_MAJOR; + } + else if ( is_scsi ) + { + gd->minor_shift = XLSCSI_PARTN_SHIFT; + gd->nr_real = XLSCSI_DEVS_PER_MAJOR; + } + else + { + gd->minor_shift = XLVBD_PARTN_SHIFT; + gd->nr_real = XLVBD_DEVS_PER_MAJOR; + } + + /* + ** The sizes[] and part[] arrays hold the sizes and other + ** information about every partition with this 'major' (i.e. + ** every disk sharing the 8 bit prefix * max partns per disk) + */ + gd->sizes = kmalloc(max_part*gd->nr_real*sizeof(int), GFP_KERNEL); + gd->part = kmalloc(max_part*gd->nr_real*sizeof(struct hd_struct), + GFP_KERNEL); + memset(gd->sizes, 0, max_part * gd->nr_real * sizeof(int)); + memset(gd->part, 0, max_part * gd->nr_real + * sizeof(struct hd_struct)); + + + gd->real_devices = kmalloc(gd->nr_real * sizeof(xl_disk_t), + GFP_KERNEL); + memset(gd->real_devices, 0, gd->nr_real * sizeof(xl_disk_t)); + + gd->next = NULL; + gd->fops = &xlvbd_block_fops; + + gd->de_arr = kmalloc(gd->nr_real * sizeof(*gd->de_arr), + GFP_KERNEL); + gd->flags = kmalloc(gd->nr_real * sizeof(*gd->flags), GFP_KERNEL); + + memset(gd->de_arr, 0, gd->nr_real * sizeof(*gd->de_arr)); + memset(gd->flags, 0, gd->nr_real * sizeof(*gd->flags)); + + add_gendisk(gd); + + blk_size[major] = gd->sizes; + } + + if ( XD_READONLY(xd->info) ) + set_device_ro(device, 1); + + gd->flags[minor >> gd->minor_shift] |= GENHD_FL_XEN; + + /* NB. Linux 2.4 only handles 32-bit sector offsets and capacities. 
*/ + capacity = (unsigned long)xd->capacity; + + if ( partno != 0 ) + { + /* + * If this was previously set up as a real disc we will have set + * up partition-table information. Virtual partitions override + * 'real' partitions, and the two cannot coexist on a device. + */ + if ( !(gd->flags[minor >> gd->minor_shift] & GENHD_FL_VIRT_PARTNS) && + (gd->sizes[minor & ~(max_part-1)] != 0) ) + { + /* + * Any non-zero sub-partition entries must be cleaned out before + * installing 'virtual' partition entries. The two types cannot + * coexist, and virtual partitions are favoured. + */ + kdev_t dev = device & ~(max_part-1); + for ( i = max_part - 1; i > 0; i-- ) + { + invalidate_device(dev+i, 1); + gd->part[MINOR(dev+i)].start_sect = 0; + gd->part[MINOR(dev+i)].nr_sects = 0; + gd->sizes[MINOR(dev+i)] = 0; + } + printk(KERN_ALERT + "Virtual partitions found for /dev/%s - ignoring any " + "real partition information we may have found.\n", + disk_name(gd, MINOR(device), buf)); + } + + /* Need to skankily setup 'partition' information */ + gd->part[minor].start_sect = 0; + gd->part[minor].nr_sects = capacity; + gd->sizes[minor] = capacity; + + gd->flags[minor >> gd->minor_shift] |= GENHD_FL_VIRT_PARTNS; + } + else + { + gd->part[minor].nr_sects = capacity; + gd->sizes[minor] = capacity>>(BLOCK_SIZE_BITS-9); + + /* Some final fix-ups depending on the device type */ + switch ( XD_TYPE(xd->info) ) + { + case XD_TYPE_CDROM: + case XD_TYPE_FLOPPY: + case XD_TYPE_TAPE: + gd->flags[minor >> gd->minor_shift] |= GENHD_FL_REMOVABLE; + printk(KERN_ALERT + "Skipping partition check on %s /dev/%s\n", + XD_TYPE(xd->info)==XD_TYPE_CDROM ? "cdrom" : + (XD_TYPE(xd->info)==XD_TYPE_TAPE ? "tape" : + "floppy"), disk_name(gd, MINOR(device), buf)); + break; + + case XD_TYPE_DISK: + /* Only check partitions on real discs (not virtual!). */ + if ( gd->flags[minor>>gd->minor_shift] & GENHD_FL_VIRT_PARTNS ) + { + printk(KERN_ALERT + "Skipping partition check on virtual /dev/%s\n", + disk_name(gd, MINOR(device), buf)); + break; + } + register_disk(gd, device, gd->max_p, &xlvbd_block_fops, capacity); + break; + + default: + printk(KERN_ALERT "XenoLinux: unknown device type %d\n", + XD_TYPE(xd->info)); + break; + } + } + + out: + up(&bd->bd_sem); + bdput(bd); + return rc; +} + + +/* + * xlvbd_remove_device - remove a device node if possible + * @device: numeric device ID + * + * Updates the gendisk structure and invalidates devices. + * + * This is OK for now but in future, should perhaps consider where this should + * deallocate gendisks / unregister devices. + */ +static int xlvbd_remove_device(int device) +{ + int i, rc = 0, minor = MINOR(device); + struct gendisk *gd; + struct block_device *bd; + xl_disk_t *disk = NULL; + + if ( (bd = bdget(device)) == NULL ) + return -1; + + /* + * Update of partition info, and check of usage count, is protected + * by the per-block-device semaphore. + */ + down(&bd->bd_sem); + + if ( ((gd = get_gendisk(device)) == NULL) || + ((disk = xldev_to_xldisk(device)) == NULL) ) + BUG(); + + if ( disk->usage != 0 ) + { + printk(KERN_ALERT "VBD removal failed - in use [dev=%x]\n", device); + rc = -1; + goto out; + } + + if ( (minor & (gd->max_p-1)) != 0 ) + { + /* 1: The VBD is mapped to a partition rather than a whole unit. */ + invalidate_device(device, 1); + gd->part[minor].start_sect = 0; + gd->part[minor].nr_sects = 0; + gd->sizes[minor] = 0; + + /* Clear the consists-of-virtual-partitions flag if possible. 
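 The flag
 * is per-unit while the sizes are per-minor, so it has to be recomputed by
 * scanning the unit's sub-partition entries. The scan in isolation
 * (illustrative only, not part of the patch):
 */

#include <stdio.h>

/* Nonzero if any sub-partition of the unit still has a populated entry. */
static int unit_has_virt_partns(const int *sizes, int unit_base, int max_part)
{
    int i;
    for ( i = 1; i < max_part; i++ )
        if ( sizes[unit_base + i] != 0 )
            return 1;
    return 0;
}

int main(void)
{
    int sizes[16] = { 0 };
    sizes[3] = 1024;                        /* one sub-partition survives */
    printf("virt partns? %d\n", unit_has_virt_partns(sizes, 0, 16));
    return 0;
}

/* Recompute the flag from the entries that remain: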
*/
+ gd->flags[minor >> gd->minor_shift] &= ~GENHD_FL_VIRT_PARTNS;
+ for ( i = 1; i < gd->max_p; i++ )
+ if ( gd->sizes[(minor & ~(gd->max_p-1)) + i] != 0 )
+ gd->flags[minor >> gd->minor_shift] |= GENHD_FL_VIRT_PARTNS;
+
+ /*
+ * If all virtual partitions are now gone, and a 'whole unit' VBD is
+ * present, then we can try to grok the unit's real partition table.
+ */
+ if ( !(gd->flags[minor >> gd->minor_shift] & GENHD_FL_VIRT_PARTNS) &&
+ (gd->sizes[minor & ~(gd->max_p-1)] != 0) &&
+ !(gd->flags[minor >> gd->minor_shift] & GENHD_FL_REMOVABLE) )
+ {
+ register_disk(gd,
+ device&~(gd->max_p-1),
+ gd->max_p,
+ &xlvbd_block_fops,
+ gd->part[minor&~(gd->max_p-1)].nr_sects);
+ }
+ }
+ else
+ {
+ /*
+ * 2: The VBD is mapped to an entire 'unit'. Clear all partitions.
+ * NB. The partition entries are only cleared if there are no VBDs
+ * mapped to individual partitions on this unit.
+ */
+ i = gd->max_p - 1; /* Default: clear subpartitions as well. */
+ if ( gd->flags[minor >> gd->minor_shift] & GENHD_FL_VIRT_PARTNS )
+ i = 0; /* 'Virtual' mode: only clear the 'whole unit' entry. */
+ while ( i >= 0 )
+ {
+ invalidate_device(device+i, 1);
+ gd->part[minor+i].start_sect = 0;
+ gd->part[minor+i].nr_sects = 0;
+ gd->sizes[minor+i] = 0;
+ i--;
+ }
+ }
+
+ out:
+ up(&bd->bd_sem);
+ bdput(bd);
+ return rc;
+}
+
+/*
+ * xlvbd_update_vbds - reprobes the VBD status and updates driver state
+ * accordingly. The VBDs need to be updated in this way when the domain is
+ * initialised and also each time we receive an XLBLK_UPDATE event.
+ */
+void xlvbd_update_vbds(void)
+{
+ int i, j, k, old_nr, new_nr;
+ xen_disk_t *old_info, *new_info, *merged_info;
+
+ old_info = vbd_info;
+ old_nr = nr_vbds;
+
+ new_info = kmalloc(MAX_VBDS * sizeof(xen_disk_t), GFP_KERNEL);
+ if ( unlikely((new_nr = xlvbd_get_vbd_info(new_info)) < 0) )
+ {
+ kfree(new_info);
+ return;
+ }
+
+ /*
+ * Final list maximum size is old list + new list. This occurs only when
+ * old list and new list do not overlap at all, and we cannot yet destroy
+ * VBDs in the old list because the usage counts are busy.
+ */
+ merged_info = kmalloc((old_nr + new_nr) * sizeof(xen_disk_t), GFP_KERNEL);
+
+ /* @i tracks old list; @j tracks new list; @k tracks merged list. */
+ i = j = k = 0;
+
+ while ( (i < old_nr) && (j < new_nr) )
+ {
+ if ( old_info[i].device < new_info[j].device )
+ {
+ if ( xlvbd_remove_device(old_info[i].device) != 0 )
+ memcpy(&merged_info[k++], &old_info[i], sizeof(xen_disk_t));
+ i++;
+ }
+ else if ( old_info[i].device > new_info[j].device )
+ {
+ if ( xlvbd_init_device(&new_info[j]) == 0 )
+ memcpy(&merged_info[k++], &new_info[j], sizeof(xen_disk_t));
+ j++;
+ }
+ else
+ {
+ if ( ((old_info[i].capacity == new_info[j].capacity) &&
+ (old_info[i].info == new_info[j].info)) ||
+ (xlvbd_remove_device(old_info[i].device) != 0) )
+ memcpy(&merged_info[k++], &old_info[i], sizeof(xen_disk_t));
+ else if ( xlvbd_init_device(&new_info[j]) == 0 )
+ memcpy(&merged_info[k++], &new_info[j], sizeof(xen_disk_t));
+ i++; j++;
+ }
+ }
+
+ for ( ; i < old_nr; i++ )
+ {
+ if ( xlvbd_remove_device(old_info[i].device) != 0 )
+ memcpy(&merged_info[k++], &old_info[i], sizeof(xen_disk_t));
+ }
+
+ for ( ; j < new_nr; j++ )
+ {
+ if ( xlvbd_init_device(&new_info[j]) == 0 )
+ memcpy(&merged_info[k++], &new_info[j], sizeof(xen_disk_t));
+ }
+
+ vbd_info = merged_info;
+ nr_vbds = k;
+
+ kfree(old_info);
+ kfree(new_info);
+}
+
+
+/*
+ * Set up all the linux device goop for the virtual block devices (vbd's) that
+ * xen tells us about.
Note that although from xen's pov VBDs are addressed
+ * simply as an opaque 16-bit device number, the domain creation tools
+ * conventionally allocate these numbers to correspond to those used by 'real'
+ * linux -- this is just for convenience as it means e.g. that the same
+ * /etc/fstab can be used when booting with or without xen.
+ */
+int __init xlvbd_init(void)
+{
+ int i;
+
+ /*
+ * If compiled as a module, we don't support unloading yet. We therefore
+ * permanently increment the reference count to disallow it.
+ */
+ SET_MODULE_OWNER(&xlvbd_block_fops);
+ MOD_INC_USE_COUNT;
+
+ /* Initialize the global arrays. */
+ for ( i = 0; i < 256; i++ )
+ {
+ /* from the generic ide code (drivers/ide/ide-probe.c, etc) */
+ xlide_blksize_size[i] = 1024;
+ xlide_hardsect_size[i] = 512;
+ xlide_max_sectors[i] = 128; /* 'hwif->rqsize' if we knew it */
+
+ /* from the generic scsi disk code (drivers/scsi/sd.c) */
+ xlscsi_blksize_size[i] = 1024; /* XXX 512; */
+ xlscsi_hardsect_size[i] = 512;
+ xlscsi_max_sectors[i] = 128*8; /* XXX 128; */
+
+ /* we don't really know what to set these to, since it depends on
+ * the underlying device */
+ xlvbd_blksize_size[i] = 512;
+ xlvbd_hardsect_size[i] = 512;
+ xlvbd_max_sectors[i] = 128;
+ }
+
+ vbd_info = kmalloc(MAX_VBDS * sizeof(xen_disk_t), GFP_KERNEL);
+ nr_vbds = xlvbd_get_vbd_info(vbd_info);
+
+ if ( nr_vbds < 0 )
+ {
+ kfree(vbd_info);
+ vbd_info = NULL;
+ nr_vbds = 0;
+ }
+ else
+ {
+ for ( i = 0; i < nr_vbds; i++ )
+ xlvbd_init_device(&vbd_info[i]);
+ }
+
+ return 0;
+}
+
+
+#ifdef MODULE
+module_init(xlvbd_init);
+#endif diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/console/Makefile b/linux-2.4.26-xen-sparse/arch/xen/drivers/console/Makefile new file mode 100644 index 0000000000..aaa546a8f3 --- /dev/null +++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/console/Makefile @@ -0,0 +1,3 @@
+O_TARGET := drv.o
+obj-$(CONFIG_XEN_CONSOLE) := console.o
+include $(TOPDIR)/Rules.make diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/console/console.c b/linux-2.4.26-xen-sparse/arch/xen/drivers/console/console.c new file mode 100644 index 0000000000..91b97e83d9 --- /dev/null +++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/console/console.c @@ -0,0 +1,625 @@
+/******************************************************************************
+ * console.c
+ *
+ * Virtual console driver.
+ *
+ * Copyright (c) 2002-2004, K A Fraser.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/interrupt.h>
+#include <linux/tty.h>
+#include <linux/tty_flip.h>
+#include <linux/serial.h>
+#include <linux/major.h>
+#include <linux/ptrace.h>
+#include <linux/ioport.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/console.h>
+#include <asm/evtchn.h>
+#include <asm/io.h>
+#include <asm/irq.h>
+#include <asm/uaccess.h>
+#include <asm/hypervisor.h>
+#include <asm/hypervisor-ifs/event_channel.h>
+#include <asm/ctrl_if.h>
+
+/*
+ * Modes:
+ * 'xencons=off' [XC_OFF]: Console is disabled.
+ * 'xencons=tty' [XC_TTY]: Console attached to '/dev/tty[0-9]+'.
+ * 'xencons=ttyS' [XC_SERIAL]: Console attached to '/dev/ttyS[0-9]+'.
+ * [XC_DEFAULT]: DOM0 -> XC_SERIAL ; all others -> XC_TTY.
+ *
+ * NB. In mode XC_TTY, we create dummy consoles for tty2-63. This suppresses
+ * warnings from standard distro startup scripts.
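 *
 * The mode is chosen by a kernel command-line parameter; the kernel calls
 * the __setup("xencons", ...) handler below with whatever text follows
 * "xencons=". The selection logic in isolation (illustrative only, not
 * part of the patch):
 */

#include <stdio.h>
#include <string.h>

enum demo_mode { MODE_OFF, MODE_DEFAULT, MODE_TTY, MODE_SERIAL };

static enum demo_mode parse_xencons(const char *str)
{
    if ( strcmp(str, "tty") == 0 )  return MODE_TTY;
    if ( strcmp(str, "ttyS") == 0 ) return MODE_SERIAL;
    if ( strcmp(str, "off") == 0 )  return MODE_OFF;
    return MODE_DEFAULT;           /* unrecognised text keeps the default */
}

int main(void)
{
    printf("xencons=ttyS -> mode %d\n", parse_xencons("ttyS"));
    return 0;
}

/* Mode state and the boot-time hook: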
+ */ +static enum { XC_OFF, XC_DEFAULT, XC_TTY, XC_SERIAL } xc_mode = XC_DEFAULT; + +static int __init xencons_setup(char *str) +{ + if ( !strcmp(str, "tty") ) + xc_mode = XC_TTY; + else if ( !strcmp(str, "ttyS") ) + xc_mode = XC_SERIAL; + else if ( !strcmp(str, "off") ) + xc_mode = XC_OFF; + return 1; +} +__setup("xencons", xencons_setup); + +/* The kernel and user-land drivers share a common transmit buffer. */ +#define WBUF_SIZE 4096 +#define WBUF_MASK(_i) ((_i)&(WBUF_SIZE-1)) +static char wbuf[WBUF_SIZE]; +static unsigned int wc, wp; /* write_cons, write_prod */ + +/* This lock protects accesses to the common transmit buffer. */ +static spinlock_t xencons_lock = SPIN_LOCK_UNLOCKED; + +/* Common transmit-kick routine. */ +static void __xencons_tx_flush(void); + +/* This task is used to defer sending console data until there is space. */ +static void xencons_tx_flush_task_routine(void *data); +static struct tq_struct xencons_tx_flush_task = { + routine: xencons_tx_flush_task_routine +}; + + +/******************** Kernel console driver ********************************/ + +static void kcons_write( + struct console *c, const char *s, unsigned int count) +{ + int i; + unsigned long flags; + + spin_lock_irqsave(&xencons_lock, flags); + + for ( i = 0; i < count; i++ ) + { + if ( (wp - wc) >= (WBUF_SIZE - 1) ) + break; + if ( (wbuf[WBUF_MASK(wp++)] = s[i]) == '\n' ) + wbuf[WBUF_MASK(wp++)] = '\r'; + } + + __xencons_tx_flush(); + + spin_unlock_irqrestore(&xencons_lock, flags); +} + +static void kcons_write_dom0( + struct console *c, const char *s, unsigned int count) +{ + int rc; + + while ( count > 0 ) + { + if ( (rc = HYPERVISOR_console_io(CONSOLEIO_write, count, s)) > 0 ) + { + count -= rc; + s += rc; + } + else + break; + } +} + +static kdev_t kcons_device(struct console *c) +{ + return MKDEV(TTY_MAJOR, (xc_mode == XC_SERIAL) ? 64 : 1); +} + +static struct console kcons_info = { + device: kcons_device, + flags: CON_PRINTBUFFER, + index: -1 +}; + +void xen_console_init(void) +{ + if ( start_info.flags & SIF_INITDOMAIN ) + { + if ( xc_mode == XC_DEFAULT ) + xc_mode = XC_SERIAL; + kcons_info.write = kcons_write_dom0; + } + else + { + if ( xc_mode == XC_DEFAULT ) + xc_mode = XC_TTY; + kcons_info.write = kcons_write; + } + + if ( xc_mode == XC_OFF ) + return; + + if ( xc_mode == XC_SERIAL ) + strcpy(kcons_info.name, "ttyS"); + else + strcpy(kcons_info.name, "tty"); + + register_console(&kcons_info); +} + + +/*** Useful function for console debugging -- goes straight to Xen. ***/ +asmlinkage int xprintk(const char *fmt, ...) +{ + va_list args; + int printk_len; + static char printk_buf[1024]; + + /* Emit the output into the temporary buffer */ + va_start(args, fmt); + printk_len = vsnprintf(printk_buf, sizeof(printk_buf), fmt, args); + va_end(args); + + /* Send the processed output directly to Xen. */ + kcons_write_dom0(NULL, printk_buf, printk_len); + + return 0; +} + +/*** Forcibly flush console data before dying. ***/ +void xencons_force_flush(void) +{ + ctrl_msg_t msg; + int sz; + + /* Emergency console is synchronous, so there's nothing to flush. */ + if ( start_info.flags & SIF_INITDOMAIN ) + return; + + /* + * We use dangerous control-interface functions that require a quiescent + * system and no interrupts. Try to ensure this with a global cli(). + */ + cli(); + + /* Spin until console data is flushed through to the domain controller. */ + while ( (wc != wp) && !ctrl_if_transmitter_empty() ) + { + /* Interrupts are disabled -- we must manually reap responses. 
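 Each message
 * carries the largest contiguous run available: the amount queued, clamped
 * to the payload size and to the ring wrap, exactly as computed just below.
 * The clamp in isolation (illustrative only, not part of the patch):
 */

#include <stdio.h>

static unsigned int contig_chunk(unsigned int cons, unsigned int prod,
                                 unsigned int ring_size, unsigned int max_msg)
{
    unsigned int sz = prod - cons;           /* indexes are free-running */
    if ( sz > max_msg )
        sz = max_msg;
    if ( sz > (ring_size - (cons & (ring_size - 1))) )
        sz = ring_size - (cons & (ring_size - 1));   /* stop at the wrap */
    return sz;
}

int main(void)
{
    /* 4096-byte ring, 60-byte payload, consumer 6 bytes short of the wrap */
    printf("chunk = %u\n", contig_chunk(4090, 4200, 4096, 60));
    return 0;
}

/* Reap queued responses by hand, then push out the next chunk: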
*/ + ctrl_if_discard_responses(); + + if ( (sz = wp - wc) == 0 ) + continue; + if ( sz > sizeof(msg.msg) ) + sz = sizeof(msg.msg); + if ( sz > (WBUF_SIZE - WBUF_MASK(wc)) ) + sz = WBUF_SIZE - WBUF_MASK(wc); + + msg.type = CMSG_CONSOLE; + msg.subtype = CMSG_CONSOLE_DATA; + msg.length = sz; + memcpy(msg.msg, &wbuf[WBUF_MASK(wc)], sz); + + if ( ctrl_if_send_message_noblock(&msg, NULL, 0) == 0 ) + wc += sz; + } +} + + +/******************** User-space console driver (/dev/console) ************/ + +static struct tty_driver xencons_driver; +static int xencons_refcount; +static struct tty_struct *xencons_table[MAX_NR_CONSOLES]; +static struct termios *xencons_termios[MAX_NR_CONSOLES]; +static struct termios *xencons_termios_locked[MAX_NR_CONSOLES]; +static struct tty_struct *xencons_tty; +static int xencons_priv_irq; +static char x_char; + +/* Non-privileged receive callback. */ +static void xencons_rx(ctrl_msg_t *msg, unsigned long id) +{ + int i; + unsigned long flags; + + spin_lock_irqsave(&xencons_lock, flags); + if ( xencons_tty != NULL ) + { + for ( i = 0; i < msg->length; i++ ) + tty_insert_flip_char(xencons_tty, msg->msg[i], 0); + tty_flip_buffer_push(xencons_tty); + } + spin_unlock_irqrestore(&xencons_lock, flags); + + msg->length = 0; + ctrl_if_send_response(msg); +} + +/* Privileged and non-privileged transmit worker. */ +static void __xencons_tx_flush(void) +{ + int sz, work_done = 0; + ctrl_msg_t msg; + + if ( start_info.flags & SIF_INITDOMAIN ) + { + if ( x_char ) + { + kcons_write_dom0(NULL, &x_char, 1); + x_char = 0; + work_done = 1; + } + + while ( wc != wp ) + { + sz = wp - wc; + if ( sz > (WBUF_SIZE - WBUF_MASK(wc)) ) + sz = WBUF_SIZE - WBUF_MASK(wc); + kcons_write_dom0(NULL, &wbuf[WBUF_MASK(wc)], sz); + wc += sz; + work_done = 1; + } + } + else + { + while ( x_char ) + { + msg.type = CMSG_CONSOLE; + msg.subtype = CMSG_CONSOLE_DATA; + msg.length = 1; + msg.msg[0] = x_char; + + if ( ctrl_if_send_message_noblock(&msg, NULL, 0) == 0 ) + x_char = 0; + else if ( ctrl_if_enqueue_space_callback(&xencons_tx_flush_task) ) + break; + + work_done = 1; + } + + while ( wc != wp ) + { + sz = wp - wc; + if ( sz > sizeof(msg.msg) ) + sz = sizeof(msg.msg); + if ( sz > (WBUF_SIZE - WBUF_MASK(wc)) ) + sz = WBUF_SIZE - WBUF_MASK(wc); + + msg.type = CMSG_CONSOLE; + msg.subtype = CMSG_CONSOLE_DATA; + msg.length = sz; + memcpy(msg.msg, &wbuf[WBUF_MASK(wc)], sz); + + if ( ctrl_if_send_message_noblock(&msg, NULL, 0) == 0 ) + wc += sz; + else if ( ctrl_if_enqueue_space_callback(&xencons_tx_flush_task) ) + break; + + work_done = 1; + } + } + + if ( work_done && (xencons_tty != NULL) ) + { + wake_up_interruptible(&xencons_tty->write_wait); + if ( (xencons_tty->flags & (1 << TTY_DO_WRITE_WAKEUP)) && + (xencons_tty->ldisc.write_wakeup != NULL) ) + (xencons_tty->ldisc.write_wakeup)(xencons_tty); + } +} + +/* Non-privileged transmit kicker. */ +static void xencons_tx_flush_task_routine(void *data) +{ + unsigned long flags; + spin_lock_irqsave(&xencons_lock, flags); + __xencons_tx_flush(); + spin_unlock_irqrestore(&xencons_lock, flags); +} + +/* Privileged receive callback and transmit kicker. */ +static void xencons_priv_interrupt(int irq, void *dev_id, struct pt_regs *regs) +{ + static char rbuf[16]; + int i, l; + unsigned long flags; + + spin_lock_irqsave(&xencons_lock, flags); + + if ( xencons_tty != NULL ) + { + /* Receive work. 
*/ + while ( (l = HYPERVISOR_console_io(CONSOLEIO_read, 16, rbuf)) > 0 ) + for ( i = 0; i < l; i++ ) + tty_insert_flip_char(xencons_tty, rbuf[i], 0); + if ( xencons_tty->flip.count != 0 ) + tty_flip_buffer_push(xencons_tty); + } + + /* Transmit work. */ + __xencons_tx_flush(); + + spin_unlock_irqrestore(&xencons_lock, flags); +} + +static int xencons_write_room(struct tty_struct *tty) +{ + return WBUF_SIZE - (wp - wc); +} + +static int xencons_chars_in_buffer(struct tty_struct *tty) +{ + return wp - wc; +} + +static void xencons_send_xchar(struct tty_struct *tty, char ch) +{ + unsigned long flags; + + if ( MINOR(tty->device) != xencons_driver.minor_start ) + return; + + spin_lock_irqsave(&xencons_lock, flags); + x_char = ch; + __xencons_tx_flush(); + spin_unlock_irqrestore(&xencons_lock, flags); +} + +static void xencons_throttle(struct tty_struct *tty) +{ + if ( MINOR(tty->device) != xencons_driver.minor_start ) + return; + + if ( I_IXOFF(tty) ) + xencons_send_xchar(tty, STOP_CHAR(tty)); +} + +static void xencons_unthrottle(struct tty_struct *tty) +{ + if ( MINOR(tty->device) != xencons_driver.minor_start ) + return; + + if ( I_IXOFF(tty) ) + { + if ( x_char != 0 ) + x_char = 0; + else + xencons_send_xchar(tty, START_CHAR(tty)); + } +} + +static void xencons_flush_buffer(struct tty_struct *tty) +{ + unsigned long flags; + + if ( MINOR(tty->device) != xencons_driver.minor_start ) + return; + + spin_lock_irqsave(&xencons_lock, flags); + wc = wp = 0; + spin_unlock_irqrestore(&xencons_lock, flags); +} + +static inline int __xencons_put_char(int ch) +{ + char _ch = (char)ch; + if ( (wp - wc) == WBUF_SIZE ) + return 0; + wbuf[WBUF_MASK(wp++)] = _ch; + return 1; +} + +static int xencons_write(struct tty_struct *tty, int from_user, + const u_char * buf, int count) +{ + int i; + unsigned long flags; + + if ( from_user && verify_area(VERIFY_READ, buf, count) ) + return -EINVAL; + + if ( MINOR(tty->device) != xencons_driver.minor_start ) + return count; + + spin_lock_irqsave(&xencons_lock, flags); + + for ( i = 0; i < count; i++ ) + { + char ch; + if ( from_user ) + __get_user(ch, buf + i); + else + ch = buf[i]; + if ( !__xencons_put_char(ch) ) + break; + } + + if ( i != 0 ) + __xencons_tx_flush(); + + spin_unlock_irqrestore(&xencons_lock, flags); + + return i; +} + +static void xencons_put_char(struct tty_struct *tty, u_char ch) +{ + unsigned long flags; + + if ( MINOR(tty->device) != xencons_driver.minor_start ) + return; + + spin_lock_irqsave(&xencons_lock, flags); + (void)__xencons_put_char(ch); + spin_unlock_irqrestore(&xencons_lock, flags); +} + +static void xencons_flush_chars(struct tty_struct *tty) +{ + unsigned long flags; + + if ( MINOR(tty->device) != xencons_driver.minor_start ) + return; + + spin_lock_irqsave(&xencons_lock, flags); + __xencons_tx_flush(); + spin_unlock_irqrestore(&xencons_lock, flags); +} + +static void xencons_wait_until_sent(struct tty_struct *tty, int timeout) +{ + unsigned long orig_jiffies = jiffies; + + if ( MINOR(tty->device) != xencons_driver.minor_start ) + return; + + while ( tty->driver.chars_in_buffer(tty) ) + { + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(1); + if ( signal_pending(current) ) + break; + if ( (timeout != 0) && time_after(jiffies, orig_jiffies + timeout) ) + break; + } + + set_current_state(TASK_RUNNING); +} + +static int xencons_open(struct tty_struct *tty, struct file *filp) +{ + int line; + unsigned long flags; + + if ( MINOR(tty->device) != xencons_driver.minor_start ) + return 0; + + MOD_INC_USE_COUNT; + line = 
MINOR(tty->device) - tty->driver.minor_start; + if ( line != 0 ) + { + MOD_DEC_USE_COUNT; + return -ENODEV; + } + + spin_lock_irqsave(&xencons_lock, flags); + tty->driver_data = NULL; + if ( xencons_tty == NULL ) + xencons_tty = tty; + __xencons_tx_flush(); + spin_unlock_irqrestore(&xencons_lock, flags); + + return 0; +} + +static void xencons_close(struct tty_struct *tty, struct file *filp) +{ + unsigned long flags; + + if ( MINOR(tty->device) != xencons_driver.minor_start ) + return; + + if ( tty->count == 1 ) + { + tty->closing = 1; + tty_wait_until_sent(tty, 0); + if ( tty->driver.flush_buffer != NULL ) + tty->driver.flush_buffer(tty); + if ( tty->ldisc.flush_buffer != NULL ) + tty->ldisc.flush_buffer(tty); + tty->closing = 0; + spin_lock_irqsave(&xencons_lock, flags); + xencons_tty = NULL; + spin_unlock_irqrestore(&xencons_lock, flags); + } + + MOD_DEC_USE_COUNT; +} + +static int __init xencons_init(void) +{ + memset(&xencons_driver, 0, sizeof(struct tty_driver)); + xencons_driver.magic = TTY_DRIVER_MAGIC; + xencons_driver.major = TTY_MAJOR; + xencons_driver.type = TTY_DRIVER_TYPE_SERIAL; + xencons_driver.subtype = SERIAL_TYPE_NORMAL; + xencons_driver.init_termios = tty_std_termios; + xencons_driver.flags = + TTY_DRIVER_REAL_RAW | TTY_DRIVER_RESET_TERMIOS | TTY_DRIVER_NO_DEVFS; + xencons_driver.refcount = &xencons_refcount; + xencons_driver.table = xencons_table; + xencons_driver.termios = xencons_termios; + xencons_driver.termios_locked = xencons_termios_locked; + + if ( xc_mode == XC_OFF ) + return 0; + + if ( xc_mode == XC_SERIAL ) + { + xencons_driver.name = "ttyS"; + xencons_driver.minor_start = 64; + xencons_driver.num = 1; + } + else + { + xencons_driver.name = "tty"; + xencons_driver.minor_start = 1; + xencons_driver.num = MAX_NR_CONSOLES; + } + + xencons_driver.open = xencons_open; + xencons_driver.close = xencons_close; + xencons_driver.write = xencons_write; + xencons_driver.write_room = xencons_write_room; + xencons_driver.put_char = xencons_put_char; + xencons_driver.flush_chars = xencons_flush_chars; + xencons_driver.chars_in_buffer = xencons_chars_in_buffer; + xencons_driver.send_xchar = xencons_send_xchar; + xencons_driver.flush_buffer = xencons_flush_buffer; + xencons_driver.throttle = xencons_throttle; + xencons_driver.unthrottle = xencons_unthrottle; + xencons_driver.wait_until_sent = xencons_wait_until_sent; + + if ( tty_register_driver(&xencons_driver) ) + panic("Couldn't register Xen virtual console driver\n"); + + if ( start_info.flags & SIF_INITDOMAIN ) + { + xencons_priv_irq = bind_virq_to_irq(VIRQ_CONSOLE); + (void)request_irq(xencons_priv_irq, + xencons_priv_interrupt, 0, "console", NULL); + } + else + { + (void)ctrl_if_register_receiver(CMSG_CONSOLE, xencons_rx, 0); + } + + printk("Xen virtual console successfully installed\n"); + + return 0; +} + +static void __exit xencons_fini(void) +{ + int ret; + + if ( (ret = tty_unregister_driver(&xencons_driver)) != 0 ) + printk(KERN_ERR "Unable to unregister Xen console driver: %d\n", ret); + + if ( start_info.flags & SIF_INITDOMAIN ) + { + free_irq(xencons_priv_irq, NULL); + unbind_virq_from_irq(VIRQ_CONSOLE); + } + else + { + ctrl_if_unregister_receiver(CMSG_CONSOLE, xencons_rx); + } +} + +module_init(xencons_init); +module_exit(xencons_fini); diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/dom0/Makefile b/linux-2.4.26-xen-sparse/arch/xen/drivers/dom0/Makefile new file mode 100644 index 0000000000..3e2e17bd23 --- /dev/null +++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/dom0/Makefile @@ -0,0 +1,3 @@ 
+O_TARGET := drv.o +obj-y := core.o vfr.o +include $(TOPDIR)/Rules.make diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/dom0/core.c b/linux-2.4.26-xen-sparse/arch/xen/drivers/dom0/core.c new file mode 100644 index 0000000000..5412a28572 --- /dev/null +++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/dom0/core.c @@ -0,0 +1,233 @@ +/****************************************************************************** + * core.c + * + * Interface to privileged domain-0 commands. + * + * Copyright (c) 2002-2004, K A Fraser, B Dragovic + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/mm.h> +#include <linux/mman.h> +#include <linux/swap.h> +#include <linux/smp_lock.h> +#include <linux/swapctl.h> +#include <linux/iobuf.h> +#include <linux/highmem.h> +#include <linux/pagemap.h> +#include <linux/seq_file.h> + +#include <asm/pgalloc.h> +#include <asm/pgtable.h> +#include <asm/uaccess.h> +#include <asm/tlb.h> +#include <asm/proc_cmd.h> +#include <asm/hypervisor-ifs/dom0_ops.h> +#include <asm/xen_proc.h> + +static struct proc_dir_entry *privcmd_intf; + +static int privcmd_ioctl(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long data) +{ + int ret = -ENOSYS; + + switch ( cmd ) + { + case IOCTL_PRIVCMD_HYPERCALL: + { + privcmd_hypercall_t hypercall; + + if ( copy_from_user(&hypercall, (void *)data, sizeof(hypercall)) ) + return -EFAULT; + + __asm__ __volatile__ ( + "pushl %%ebx; pushl %%ecx; pushl %%edx; pushl %%esi; pushl %%edi; " + "movl 4(%%eax),%%ebx ;" + "movl 8(%%eax),%%ecx ;" + "movl 12(%%eax),%%edx ;" + "movl 16(%%eax),%%esi ;" + "movl 20(%%eax),%%edi ;" + "movl (%%eax),%%eax ;" + TRAP_INSTR "; " + "popl %%edi; popl %%esi; popl %%edx; popl %%ecx; popl %%ebx" + : "=a" (ret) : "0" (&hypercall) : "memory" ); + + } + break; + + case IOCTL_PRIVCMD_INITDOMAIN_EVTCHN: + { + extern int initdom_ctrlif_domcontroller_port; + ret = initdom_ctrlif_domcontroller_port; + } + break; + + + case IOCTL_PRIVCMD_MMAP: + { +#define PRIVCMD_MMAP_SZ 32 + privcmd_mmap_t mmapcmd; + privcmd_mmap_entry_t msg[PRIVCMD_MMAP_SZ], *p; + int i, rc; + + if ( copy_from_user(&mmapcmd, (void *)data, sizeof(mmapcmd)) ) + return -EFAULT; + + p = mmapcmd.entry; + + for (i=0; i<mmapcmd.num; i+=PRIVCMD_MMAP_SZ, p+=PRIVCMD_MMAP_SZ) + { + int j, n = ((mmapcmd.num-i)>PRIVCMD_MMAP_SZ)? 
+ PRIVCMD_MMAP_SZ:(mmapcmd.num-i); + if ( copy_from_user(&msg, p, n*sizeof(privcmd_mmap_entry_t)) ) + return -EFAULT; + + for ( j = 0; j < n; j++ ) + { + struct vm_area_struct *vma = + find_vma( current->mm, msg[j].va ); + + if ( !vma ) + return -EINVAL; + + if ( msg[j].va > PAGE_OFFSET ) + return -EINVAL; + + if ( (msg[j].va + (msg[j].npages<<PAGE_SHIFT)) > vma->vm_end ) + return -EINVAL; + + if ( (rc = direct_remap_area_pages(vma->vm_mm, + msg[j].va&PAGE_MASK, + msg[j].mfn<<PAGE_SHIFT, + msg[j].npages<<PAGE_SHIFT, + vma->vm_page_prot, + mmapcmd.dom)) < 0 ) + return rc; + } + } + ret = 0; + } + break; + + case IOCTL_PRIVCMD_MMAPBATCH: + { +#define MAX_DIRECTMAP_MMU_QUEUE 130 + mmu_update_t u[MAX_DIRECTMAP_MMU_QUEUE], *w, *v; + privcmd_mmapbatch_t m; + struct vm_area_struct *vma = NULL; + unsigned long *p, addr; + unsigned long mfn; + int i; + + if ( copy_from_user(&m, (void *)data, sizeof(m)) ) + { ret = -EFAULT; goto batch_err; } + + vma = find_vma( current->mm, m.addr ); + + if ( !vma ) + { ret = -EINVAL; goto batch_err; } + + if ( m.addr > PAGE_OFFSET ) + { ret = -EFAULT; goto batch_err; } + + if ( (m.addr + (m.num<<PAGE_SHIFT)) > vma->vm_end ) + { ret = -EFAULT; goto batch_err; } + + if ( m.dom != 0 ) + { + u[0].val = (unsigned long)(m.dom<<16) & ~0xFFFFUL; + u[0].ptr = (unsigned long)(m.dom<< 0) & ~0xFFFFUL; + u[0].ptr |= MMU_EXTENDED_COMMAND; + u[0].val |= MMUEXT_SET_SUBJECTDOM; + v = w = &u[1]; + } + else + { + v = w = &u[0]; + } + + p = m.arr; + addr = m.addr; + for ( i = 0; i < m.num; i++, addr += PAGE_SIZE, p++ ) + { + if ( get_user(mfn, p) ) return -EFAULT; + + v->val = (mfn << PAGE_SHIFT) | pgprot_val(vma->vm_page_prot) | + _PAGE_IO; + + __direct_remap_area_pages(vma->vm_mm, + addr, + PAGE_SIZE, + v); + + if ( unlikely(HYPERVISOR_mmu_update(u, v - u + 1, NULL) < 0) ) + put_user( 0xF0000000 | mfn, p ); + + v = w; + } + ret = 0; + break; + + batch_err: + printk("batch_err ret=%d vma=%p addr=%lx num=%d arr=%p %lx-%lx\n", + ret, vma, m.addr, m.num, m.arr, vma->vm_start, vma->vm_end); + break; + } + break; + + default: + ret = -EINVAL; + break; + } + return ret; +} + +static int privcmd_mmap(struct file * file, struct vm_area_struct * vma) +{ + /* DONTCOPY is essential for Xen as copy_page_range is broken. */ + vma->vm_flags |= VM_RESERVED | VM_IO | VM_DONTCOPY; + + return 0; +} + +static struct file_operations privcmd_file_ops = { + ioctl : privcmd_ioctl, + mmap: privcmd_mmap +}; + + +static int __init init_module(void) +{ + if ( !(start_info.flags & SIF_PRIVILEGED) ) + return 0; + + privcmd_intf = create_xen_proc_entry("privcmd", 0400); + if ( privcmd_intf != NULL ) + { + privcmd_intf->owner = THIS_MODULE; + privcmd_intf->nlink = 1; + privcmd_intf->proc_fops = &privcmd_file_ops; + } + + return 0; +} + + +static void __exit cleanup_module(void) +{ + if ( privcmd_intf == NULL ) return; + remove_xen_proc_entry("privcmd"); + privcmd_intf = NULL; +} + + +module_init(init_module); +module_exit(cleanup_module); +# diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/dom0/vfr.c b/linux-2.4.26-xen-sparse/arch/xen/drivers/dom0/vfr.c new file mode 100644 index 0000000000..9d8ca0a32d --- /dev/null +++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/dom0/vfr.c @@ -0,0 +1,343 @@ +/****************************************************************************** + * vfr.c + * + * Interface to the virtual firewall/router. 
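 *
 * Rules are plain text written to the proc node created below with
 * create_xen_proc_entry("vfr", 0600), conventionally visible as
 * /proc/xen/vfr (the exact path is an assumption). A hypothetical
 * userspace writer matching the grammar parsed by vfr_write_proc():
 */

#include <stdio.h>

int main(void)
{
    FILE *f = fopen("/proc/xen/vfr", "w");

    if ( f == NULL )
        return 1;
    /* COMMAND ACTION field=val ..., as understood by the parser below. */
    fprintf(f, "ADD ACCEPT srcaddr=10.0.0.1 dstport=80 proto=tcp\n");
    fclose(f);
    return 0;
}

/*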
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <asm/xen_proc.h>
+#include <asm/hypervisor-ifs/network.h>
+
+static struct proc_dir_entry *proc_vfr;
+
+static unsigned char readbuf[1024];
+
+/* Helpers, implemented at the bottom. */
+u32 getipaddr(const char *buff, unsigned int len);
+u16 antous(const char *buff, int len);
+u64 antoull(const char *buff, int len);
+int anton(const char *buff, int len);
+
+static int vfr_read_proc(char *page, char **start, off_t off,
+ int count, int *eof, void *data)
+{
+ strcpy(page, readbuf);
+ *readbuf = '\0';
+ *eof = 1;
+ *start = page;
+ return strlen(page);
+}
+
+/* The format for the vfr interface is as follows:
+ *
+ * COMMAND ACTION <field>=<val> [<field>=<val> [...]]
+ *
+ * where:
+ *
+ * COMMAND = { ADD | DELETE | PRINT }
+ * (PRINT takes no action and no field=val pairs.)
+ *
+ * ACTION = { ACCEPT | COUNT }
+ *
+ * field=val pairs are as follows:
+ *
+ * field = { srcaddr | dstaddr }
+ * val is a dot-separated, numeric IP address.
+ *
+ * field = { srcport | dstport }
+ * val is a (16-bit) unsigned int
+ *
+ * field = { proto }
+ * val = { IP | TCP | UDP | ARP }
+ *
+ */
+
+#define isspace(_x) ( ((_x)==' ') || ((_x)=='\t') || ((_x)=='\v') || \
+ ((_x)=='\f') || ((_x)=='\r') || ((_x)=='\n') )
+
+static int vfr_write_proc(struct file *file, const char *buffer,
+ u_long count, void *data)
+{
+ network_op_t op;
+ int ret, len;
+ int ts, te, tl; // token start, end, and length
+ int fs, fe, fl; // field start, end, and length
+
+ len = count;
+ ts = te = 0;
+
+ memset(&op, 0, sizeof(network_op_t));
+
+ // get the command:
+ while ( count && isspace(buffer[ts]) ) { ts++; count--; } // skip spaces.
+ te = ts;
+ while ( count && !isspace(buffer[te]) ) { te++; count--; } // command end
+ if ( te <= ts ) goto bad;
+ tl = te - ts;
+
+ if ( strncmp(&buffer[ts], "ADD", tl) == 0 )
+ {
+ op.cmd = NETWORK_OP_ADDRULE;
+ }
+ else if ( strncmp(&buffer[ts], "DELETE", tl) == 0 )
+ {
+ op.cmd = NETWORK_OP_DELETERULE;
+ }
+ else if ( strncmp(&buffer[ts], "PRINT", tl) == 0 )
+ {
+ op.cmd = NETWORK_OP_GETRULELIST;
+ goto doneparsing;
+ }
+
+ ts = te;
+
+ // get the action
+ while ( count && (buffer[ts] == ' ') ) { ts++; count--; } // skip spaces.
+ te = ts;
+ while ( count && (buffer[te] != ' ') ) { te++; count--; } // action end
+ if ( te <= ts ) goto bad;
+ tl = te - ts;
+
+ if ( strncmp(&buffer[ts], "ACCEPT", tl) == 0 )
+ {
+ op.u.net_rule.action = NETWORK_ACTION_ACCEPT;
+ goto keyval;
+ }
+ if ( strncmp(&buffer[ts], "COUNT", tl) == 0 )
+ {
+ op.u.net_rule.action = NETWORK_ACTION_COUNT;
+ goto keyval;
+ }
+
+ // default case: unrecognised action.
+ return (len);
+
+
+ // get the key=val pairs.
+ keyval:
+ while (count)
+ {
+ //get field
+ ts = te; while ( count && isspace(buffer[ts]) ) { ts++; count--; }
+ te = ts;
+ while ( count && !isspace(buffer[te]) && (buffer[te] != '=') )
+ { te++; count--; }
+ if ( te <= ts )
+ goto doneparsing;
+ tl = te - ts;
+ fs = ts; fe = te; fl = tl; // save the field markers.
+ // skip " = " (ignores extra equals.)
+ while ( count && (isspace(buffer[te]) || (buffer[te] == '=')) )
+ { te++; count--; }
+ ts = te;
+ while ( count && !isspace(buffer[te]) ) { te++; count--; }
+ tl = te - ts;
+
+ if ( (fl <= 0) || (tl <= 0) ) goto bad;
+
+ /* NB. Prefix matches must go first!
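 The field keys are
 * matched with strncmp() against the user-supplied field length, so a key
 * that is a prefix of another ("src" vs "srcaddr") must be tested first or
 * the longer key falsely matches. The pitfall in isolation (illustrative
 * only, not part of the patch):
 */

#include <stdio.h>
#include <string.h>

int main(void)
{
    const char *field = "src";  /* field text as typed by the user */
    int fl = 3;                 /* parsed field length             */

    /* Wrong order: only fl bytes of "srcaddr" are compared, so the
     * longer key falsely matches the short input (prints 1). */
    printf("matches srcaddr? %d\n", strncmp(field, "srcaddr", fl) == 0);

    /* Right order: the short key is tested first (also prints 1). */
    printf("matches src?     %d\n", strncmp(field, "src", fl) == 0);
    return 0;
}

/* Keys that are prefixes of longer keys come first: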
*/ + if (strncmp(&buffer[fs], "src", fl) == 0) + { + op.u.net_rule.src_dom = VIF_SPECIAL; + op.u.net_rule.src_idx = VIF_ANY_INTERFACE; + } + else if (strncmp(&buffer[fs], "dst", fl) == 0) + { + op.u.net_rule.dst_dom = VIF_SPECIAL; + op.u.net_rule.dst_idx = VIF_PHYSICAL_INTERFACE; + } + else if (strncmp(&buffer[fs], "srcaddr", fl) == 0) + { + op.u.net_rule.src_addr = getipaddr(&buffer[ts], tl); + } + else if (strncmp(&buffer[fs], "dstaddr", fl) == 0) + { + op.u.net_rule.dst_addr = getipaddr(&buffer[ts], tl); + } + else if (strncmp(&buffer[fs], "srcaddrmask", fl) == 0) + { + op.u.net_rule.src_addr_mask = getipaddr(&buffer[ts], tl); + } + else if (strncmp(&buffer[fs], "dstaddrmask", fl) == 0) + { + op.u.net_rule.dst_addr_mask = getipaddr(&buffer[ts], tl); + } + else if (strncmp(&buffer[fs], "srcport", fl) == 0) + { + op.u.net_rule.src_port = antous(&buffer[ts], tl); + } + else if (strncmp(&buffer[fs], "dstport", fl) == 0) + { + op.u.net_rule.dst_port = antous(&buffer[ts], tl); + } + else if (strncmp(&buffer[fs], "srcportmask", fl) == 0) + { + op.u.net_rule.src_port_mask = antous(&buffer[ts], tl); + } + else if (strncmp(&buffer[fs], "dstportmask", fl) == 0) + { + op.u.net_rule.dst_port_mask = antous(&buffer[ts], tl); + } + else if (strncmp(&buffer[fs], "srcdom", fl) == 0) + { + op.u.net_rule.src_dom = antoull(&buffer[ts], tl); + } + else if (strncmp(&buffer[fs], "srcidx", fl) == 0) + { + op.u.net_rule.src_idx = anton(&buffer[ts], tl); + } + else if (strncmp(&buffer[fs], "dstdom", fl) == 0) + { + op.u.net_rule.dst_dom = antoull(&buffer[ts], tl); + } + else if (strncmp(&buffer[fs], "dstidx", fl) == 0) + { + op.u.net_rule.dst_idx = anton(&buffer[ts], tl); + } + else if ( (strncmp(&buffer[fs], "proto", fl) == 0)) + { + if (strncmp(&buffer[ts], "any", tl) == 0) + op.u.net_rule.proto = NETWORK_PROTO_ANY; + if (strncmp(&buffer[ts], "ip", tl) == 0) + op.u.net_rule.proto = NETWORK_PROTO_IP; + if (strncmp(&buffer[ts], "tcp", tl) == 0) + op.u.net_rule.proto = NETWORK_PROTO_TCP; + if (strncmp(&buffer[ts], "udp", tl) == 0) + op.u.net_rule.proto = NETWORK_PROTO_UDP; + if (strncmp(&buffer[ts], "arp", tl) == 0) + op.u.net_rule.proto = NETWORK_PROTO_ARP; + } + } + + doneparsing: + ret = HYPERVISOR_network_op(&op); + return(len); + + bad: + return(len); + + +} + +static int __init init_module(void) +{ + if ( !(start_info.flags & SIF_PRIVILEGED) ) + return 0; + + *readbuf = '\0'; + proc_vfr = create_xen_proc_entry("vfr", 0600); + if ( proc_vfr != NULL ) + { + proc_vfr->owner = THIS_MODULE; + proc_vfr->nlink = 1; + proc_vfr->read_proc = vfr_read_proc; + proc_vfr->write_proc = vfr_write_proc; + printk("Successfully installed virtual firewall/router interface\n"); + } + return 0; +} + +static void __exit cleanup_module(void) +{ + if ( proc_vfr == NULL ) return; + remove_xen_proc_entry("vfr"); + proc_vfr = NULL; +} + +module_init(init_module); +module_exit(cleanup_module); + +/* Helper functions start here: */ + +int anton(const char *buff, int len) +{ + int ret; + char c; + int sign = 1; + + ret = 0; + + if (len == 0) return 0; + if (*buff == '-') { sign = -1; buff++; len--; } + + while ( (len) && ((c = *buff) >= '0') && (c <= '9') ) + { + ret *= 10; + ret += c - '0'; + buff++; len--; + } + + ret *= sign; + return ret; +} + +u16 antous(const char *buff, int len) +{ + u16 ret; + char c; + + ret = 0; + + while ( (len) && ((c = *buff) >= '0') && (c <= '9') ) + { + ret *= 10; + ret += c - '0'; + buff++; len--; + } + + return ret; +} + +u64 antoull(const char *buff, int len) +{ + u64 ret; + char c; + + ret = 0; + + 
while ( (len) && ((c = *buff) >= '0') && (c <= '9') ) + { + ret *= 10; + ret += c - '0'; + buff++; len--; + } + + return ret; +} + +u32 getipaddr(const char *buff, unsigned int len) +{ + char c; + u32 ret, val; + + ret = 0; val = 0; + + while ( len ) + { + if (!((((c = *buff) >= '0') && ( c <= '9')) || ( c == '.' ) ) ) + { + return(0); // malformed. + } + + if ( c == '.' ) { + if (val > 255) return (0); //malformed. + ret = ret << 8; + ret += val; + val = 0; + len--; buff++; + continue; + } + val *= 10; + val += c - '0'; + buff++; len--; + } + ret = ret << 8; + ret += val; + + return (ret); +} + diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/evtchn/Makefile b/linux-2.4.26-xen-sparse/arch/xen/drivers/evtchn/Makefile new file mode 100644 index 0000000000..61c983f625 --- /dev/null +++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/evtchn/Makefile @@ -0,0 +1,3 @@ +O_TARGET := drv.o +obj-y := evtchn.o +include $(TOPDIR)/Rules.make diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/evtchn/evtchn.c b/linux-2.4.26-xen-sparse/arch/xen/drivers/evtchn/evtchn.c new file mode 100644 index 0000000000..985d72821d --- /dev/null +++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/evtchn/evtchn.c @@ -0,0 +1,372 @@ +/****************************************************************************** + * evtchn.c + * + * Xenolinux driver for receiving and demuxing event-channel signals. + * + * Copyright (c) 2004, K A Fraser + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/fs.h> +#include <linux/errno.h> +#include <linux/miscdevice.h> +#include <linux/major.h> +#include <linux/proc_fs.h> +#include <linux/devfs_fs_kernel.h> +#include <linux/stat.h> +#include <linux/poll.h> +#include <linux/irq.h> +#include <asm/evtchn.h> + +/* NB. This must be shared amongst drivers if more things go in /dev/xen */ +static devfs_handle_t xen_dev_dir; + +/* Only one process may open /dev/xen/evtchn at any time. */ +static unsigned long evtchn_dev_inuse; + +/* Notification ring, accessed via /dev/xen/evtchn. */ +#define RING_SIZE 2048 /* 2048 16-bit entries */ +#define RING_MASK(_i) ((_i)&(RING_SIZE-1)) +static u16 *ring; +static unsigned int ring_cons, ring_prod, ring_overflow; + +/* Processes wait on this queue via /dev/xen/evtchn when ring is empty. */ +static DECLARE_WAIT_QUEUE_HEAD(evtchn_wait); +static struct fasync_struct *evtchn_async_queue; + +/* Which ports is user-space bound to? */ +static u32 bound_ports[32]; + +static spinlock_t lock; + +void evtchn_device_upcall(int port) +{ + u16 port_subtype; + shared_info_t *s = HYPERVISOR_shared_info; + + spin_lock(&lock); + + mask_evtchn(port); + clear_evtchn(port); + + if ( likely(!synch_test_and_clear_bit(port, &s->evtchn_exception[0])) ) + port_subtype = PORT_NORMAL; + else + port_subtype = PORT_EXCEPTION; + + if ( ring != NULL ) + { + if ( (ring_prod - ring_cons) < RING_SIZE ) + { + ring[RING_MASK(ring_prod)] = (u16)port | port_subtype; + if ( ring_cons == ring_prod++ ) + { + wake_up_interruptible(&evtchn_wait); + kill_fasync(&evtchn_async_queue, SIGIO, POLL_IN); + } + } + else + { + ring_overflow = 1; + } + } + + spin_unlock(&lock); +} + +static void __evtchn_reset_buffer_ring(void) +{ + /* Initialise the ring to empty. Clear errors. 
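 Indexes are
 * free-running, so the ring is empty exactly when cons == prod. Because it
 * never holds more than RING_SIZE entries, the read path below can detect
 * a wrap between consumer and producer by testing the RING_SIZE bit alone.
 * That test in isolation (illustrative only, not part of the patch):
 */

#include <stdio.h>

#define DEMO_SIZE 2048u                /* entries, power of two (RING_SIZE) */
#define DEMO_MASK(_i) ((_i) & (DEMO_SIZE - 1u))

int main(void)
{
    unsigned int c = 2040u, p = 2053u; /* producer has passed the wrap */

    if ( ((c ^ p) & DEMO_SIZE) != 0 )  /* the DEMO_SIZE bit differs */
        printf("two chunks: %u entries, then %u\n",
               DEMO_SIZE - DEMO_MASK(c), DEMO_MASK(p));
    else
        printf("one chunk: %u entries\n", p - c);
    return 0;
}

/* Reset both indexes and the overflow flag: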
*/ + ring_cons = ring_prod = ring_overflow = 0; +} + +static ssize_t evtchn_read(struct file *file, char *buf, + size_t count, loff_t *ppos) +{ + int rc; + unsigned int c, p, bytes1 = 0, bytes2 = 0; + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue(&evtchn_wait, &wait); + + count &= ~1; /* even number of bytes */ + + if ( count == 0 ) + { + rc = 0; + goto out; + } + + if ( count > PAGE_SIZE ) + count = PAGE_SIZE; + + for ( ; ; ) + { + set_current_state(TASK_INTERRUPTIBLE); + + if ( (c = ring_cons) != (p = ring_prod) ) + break; + + if ( ring_overflow ) + { + rc = -EFBIG; + goto out; + } + + if ( file->f_flags & O_NONBLOCK ) + { + rc = -EAGAIN; + goto out; + } + + if ( signal_pending(current) ) + { + rc = -ERESTARTSYS; + goto out; + } + + schedule(); + } + + /* Byte lengths of two chunks. Chunk split (if any) is at ring wrap. */ + if ( ((c ^ p) & RING_SIZE) != 0 ) + { + bytes1 = (RING_SIZE - RING_MASK(c)) * sizeof(u16); + bytes2 = RING_MASK(p) * sizeof(u16); + } + else + { + bytes1 = (p - c) * sizeof(u16); + bytes2 = 0; + } + + /* Truncate chunks according to caller's maximum byte count. */ + if ( bytes1 > count ) + { + bytes1 = count; + bytes2 = 0; + } + else if ( (bytes1 + bytes2) > count ) + { + bytes2 = count - bytes1; + } + + if ( copy_to_user(buf, &ring[RING_MASK(c)], bytes1) || + ((bytes2 != 0) && copy_to_user(&buf[bytes1], &ring[0], bytes2)) ) + { + rc = -EFAULT; + goto out; + } + + ring_cons += (bytes1 + bytes2) / sizeof(u16); + + rc = bytes1 + bytes2; + + out: + __set_current_state(TASK_RUNNING); + remove_wait_queue(&evtchn_wait, &wait); + return rc; +} + +static ssize_t evtchn_write(struct file *file, const char *buf, + size_t count, loff_t *ppos) +{ + int rc, i; + u16 *kbuf = (u16 *)get_free_page(GFP_KERNEL); + + if ( kbuf == NULL ) + return -ENOMEM; + + count &= ~1; /* even number of bytes */ + + if ( count == 0 ) + { + rc = 0; + goto out; + } + + if ( count > PAGE_SIZE ) + count = PAGE_SIZE; + + if ( copy_from_user(kbuf, buf, count) != 0 ) + { + rc = -EFAULT; + goto out; + } + + spin_lock_irq(&lock); + for ( i = 0; i < (count/2); i++ ) + if ( test_bit(kbuf[i], &bound_ports[0]) ) + unmask_evtchn(kbuf[i]); + spin_unlock_irq(&lock); + + rc = count; + + out: + free_page((unsigned long)kbuf); + return rc; +} + +static int evtchn_ioctl(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg) +{ + int rc = 0; + + spin_lock_irq(&lock); + + switch ( cmd ) + { + case EVTCHN_RESET: + __evtchn_reset_buffer_ring(); + break; + case EVTCHN_BIND: + if ( !test_and_set_bit(arg, &bound_ports[0]) ) + unmask_evtchn(arg); + else + rc = -EINVAL; + break; + case EVTCHN_UNBIND: + if ( test_and_clear_bit(arg, &bound_ports[0]) ) + mask_evtchn(arg); + else + rc = -EINVAL; + break; + default: + rc = -ENOSYS; + break; + } + + spin_unlock_irq(&lock); + + return rc; +} + +static unsigned int evtchn_poll(struct file *file, poll_table *wait) +{ + unsigned int mask = POLLOUT | POLLWRNORM; + poll_wait(file, &evtchn_wait, wait); + if ( ring_cons != ring_prod ) + mask |= POLLIN | POLLRDNORM; + if ( ring_overflow ) + mask = POLLERR; + return mask; +} + +static int evtchn_fasync(int fd, struct file *filp, int on) +{ + return fasync_helper(fd, filp, on, &evtchn_async_queue); +} + +static int evtchn_open(struct inode *inode, struct file *filp) +{ + u16 *_ring; + + if ( test_and_set_bit(0, &evtchn_dev_inuse) ) + return -EBUSY; + + /* Allocate outside locked region so that we can use GFP_KERNEL. 
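 From user
 * space the device then behaves as sketched below: read() yields 16-bit
 * port numbers, writing them back unmasks those ports, and EVTCHN_BIND
 * registers interest in a port. (Hypothetical example; EVTCHN_BIND and
 * the device path come from asm/evtchn.h and the devfs setup at the
 * bottom of this file.)
 */

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <asm/evtchn.h>     /* EVTCHN_BIND; kernel header from this tree */

int main(void)
{
    unsigned short ports[64];
    ssize_t n;
    int i, fd = open("/dev/xen/evtchn", O_RDWR);

    if ( fd < 0 )
        return 1;
    if ( ioctl(fd, EVTCHN_BIND, 4) != 0 )    /* port 4: just an example */
        return 1;
    while ( (n = read(fd, ports, sizeof(ports))) > 0 )
    {
        for ( i = 0; i < (int)(n / 2); i++ )
            printf("event on port %u\n", ports[i]);
        if ( write(fd, ports, n) != n )      /* unmask the handled ports */
            break;
    }
    close(fd);
    return 0;
}

/* Allocate first, then publish the ring under the lock: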
*/ + if ( (_ring = (u16 *)get_free_page(GFP_KERNEL)) == NULL ) + return -ENOMEM; + + spin_lock_irq(&lock); + ring = _ring; + __evtchn_reset_buffer_ring(); + spin_unlock_irq(&lock); + + MOD_INC_USE_COUNT; + + return 0; +} + +static int evtchn_release(struct inode *inode, struct file *filp) +{ + int i; + + spin_lock_irq(&lock); + if ( ring != NULL ) + { + free_page((unsigned long)ring); + ring = NULL; + } + for ( i = 0; i < NR_EVENT_CHANNELS; i++ ) + if ( test_and_clear_bit(i, &bound_ports[0]) ) + mask_evtchn(i); + spin_unlock_irq(&lock); + + evtchn_dev_inuse = 0; + + MOD_DEC_USE_COUNT; + + return 0; +} + +static struct file_operations evtchn_fops = { + owner: THIS_MODULE, + read: evtchn_read, + write: evtchn_write, + ioctl: evtchn_ioctl, + poll: evtchn_poll, + fasync: evtchn_fasync, + open: evtchn_open, + release: evtchn_release +}; + +static struct miscdevice evtchn_miscdev = { + minor: EVTCHN_MINOR, + name: "evtchn", + fops: &evtchn_fops +}; + +static int __init init_module(void) +{ + devfs_handle_t symlink_handle; + int err, pos; + char link_dest[64]; + + /* (DEVFS) create '/dev/misc/evtchn'. */ + err = misc_register(&evtchn_miscdev); + if ( err != 0 ) + { + printk(KERN_ALERT "Could not register /dev/misc/evtchn\n"); + return err; + } + + /* (DEVFS) create directory '/dev/xen'. */ + xen_dev_dir = devfs_mk_dir(NULL, "xen", NULL); + + /* (DEVFS) &link_dest[pos] == '../misc/evtchn'. */ + pos = devfs_generate_path(evtchn_miscdev.devfs_handle, + &link_dest[3], + sizeof(link_dest) - 3); + if ( pos >= 0 ) + strncpy(&link_dest[pos], "../", 3); + + /* (DEVFS) symlink '/dev/xen/evtchn' -> '../misc/evtchn'. */ + (void)devfs_mk_symlink(xen_dev_dir, + "evtchn", + DEVFS_FL_DEFAULT, + &link_dest[pos], + &symlink_handle, + NULL); + + /* (DEVFS) automatically destroy the symlink with its destination. 
*/ + devfs_auto_unregister(evtchn_miscdev.devfs_handle, symlink_handle); + + printk("Event-channel device installed.\n"); + + return 0; +} + +static void cleanup_module(void) +{ + misc_deregister(&evtchn_miscdev); +} + +module_init(init_module); +module_exit(cleanup_module); diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/netif/Makefile b/linux-2.4.26-xen-sparse/arch/xen/drivers/netif/Makefile new file mode 100644 index 0000000000..20c8192d3d --- /dev/null +++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/netif/Makefile @@ -0,0 +1,10 @@ + +O_TARGET := drv.o + +subdir-y += frontend +obj-y += frontend/drv.o + +subdir-$(CONFIG_XEN_PHYSDEV_ACCESS) += backend +obj-$(CONFIG_XEN_PHYSDEV_ACCESS) += backend/drv.o + +include $(TOPDIR)/Rules.make diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/netif/backend/Makefile b/linux-2.4.26-xen-sparse/arch/xen/drivers/netif/backend/Makefile new file mode 100644 index 0000000000..9ffb0bd702 --- /dev/null +++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/netif/backend/Makefile @@ -0,0 +1,3 @@ +O_TARGET := drv.o +obj-y := main.o control.o interface.o +include $(TOPDIR)/Rules.make diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/netif/backend/common.h b/linux-2.4.26-xen-sparse/arch/xen/drivers/netif/backend/common.h new file mode 100644 index 0000000000..88881cdf66 --- /dev/null +++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/netif/backend/common.h @@ -0,0 +1,96 @@ +/****************************************************************************** + * arch/xen/drivers/netif/backend/common.h + */ + +#ifndef __NETIF__BACKEND__COMMON_H__ +#define __NETIF__BACKEND__COMMON_H__ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/interrupt.h> +#include <linux/slab.h> +#include <linux/ip.h> +#include <linux/in.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <asm/ctrl_if.h> +#include <asm/io.h> +#include "../netif.h" +#include "../../../../../net/bridge/br_private.h" + +#ifndef NDEBUG +#define ASSERT(_p) \ + if ( !(_p) ) { printk("Assertion '%s' failed, line %d, file %s", #_p , \ + __LINE__, __FILE__); *(int*)0=0; } +#define DPRINTK(_f, _a...) printk("(file=%s, line=%d) " _f, \ + __FILE__ , __LINE__ , ## _a ) +#else +#define ASSERT(_p) ((void)0) +#define DPRINTK(_f, _a...) ((void)0) +#endif + +typedef struct netif_st { + /* Unique identifier for this interface. */ + domid_t domid; + unsigned int handle; + + /* Physical parameters of the comms window. */ + unsigned long tx_shmem_frame; + unsigned long rx_shmem_frame; + unsigned int evtchn; + int irq; + + /* The shared rings and indexes. */ + netif_tx_interface_t *tx; + netif_rx_interface_t *rx; + + /* Private indexes into shared ring. */ + NETIF_RING_IDX rx_req_cons; + NETIF_RING_IDX rx_resp_prod; /* private version of shared variable */ + NETIF_RING_IDX tx_req_cons; + NETIF_RING_IDX tx_resp_prod; /* private version of shared variable */ + + /* Transmit shaping: allow 'credit_bytes' every 'credit_usec'. */ + unsigned long credit_bytes; + unsigned long credit_usec; + unsigned long remaining_credit; + struct timer_list credit_timeout; + + /* Miscellaneous private stuff. */ + enum { DISCONNECTED, DISCONNECTING, CONNECTED } status; + /* + * DISCONNECT response is deferred until pending requests are ack'ed. + * We therefore need to store the id from the original request. 
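 * The netif_get/netif_put pair defined further down implements exactly
 * this deferral: teardown runs when the last reference drops, not when
 * the DISCONNECT message arrives. The idiom in isolation (illustrative
 * only, not part of the patch; the real code uses atomic_t):
 */

#include <stdio.h>

typedef struct demo_if {
    int refcnt;                     /* atomic_t in the real driver */
} demo_if_t;

static void demo_teardown(demo_if_t *nif)
{
    printf("last reference dropped: tear down %p\n", (void *)nif);
}

static void demo_get(demo_if_t *nif) { nif->refcnt++; }

static void demo_put(demo_if_t *nif)
{
    if ( --nif->refcnt == 0 )       /* atomic_dec_and_test() in the macro */
        demo_teardown(nif);
}

int main(void)
{
    demo_if_t nif = { 0 };
    demo_get(&nif);                 /* taken when the interface connects */
    demo_get(&nif);                 /* held by an in-flight request      */
    demo_put(&nif);                 /* request acknowledged              */
    demo_put(&nif);                 /* disconnect: last ref, tear down   */
    return 0;
}

/* Bookkeeping for the deferred response: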
+ */ + u8 disconnect_rspid; + struct netif_st *hash_next; + struct list_head list; /* scheduling list */ + atomic_t refcnt; + spinlock_t rx_lock, tx_lock; + struct net_device *dev; + struct net_device_stats stats; +} netif_t; + +void netif_create(netif_be_create_t *create); +void netif_destroy(netif_be_destroy_t *destroy); +void netif_connect(netif_be_connect_t *connect); +int netif_disconnect(netif_be_disconnect_t *disconnect, u8 rsp_id); +void __netif_disconnect_complete(netif_t *netif); +netif_t *netif_find_by_handle(domid_t domid, unsigned int handle); +#define netif_get(_b) (atomic_inc(&(_b)->refcnt)) +#define netif_put(_b) \ + do { \ + if ( atomic_dec_and_test(&(_b)->refcnt) ) \ + __netif_disconnect_complete(_b); \ + } while (0) + +void netif_interface_init(void); +void netif_ctrlif_init(void); + +void netif_deschedule(netif_t *netif); + +int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev); +struct net_device_stats *netif_be_get_stats(struct net_device *dev); +void netif_be_int(int irq, void *dev_id, struct pt_regs *regs); + +#endif /* __NETIF__BACKEND__COMMON_H__ */ diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/netif/backend/control.c b/linux-2.4.26-xen-sparse/arch/xen/drivers/netif/backend/control.c new file mode 100644 index 0000000000..cf1b075031 --- /dev/null +++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/netif/backend/control.c @@ -0,0 +1,65 @@ +/****************************************************************************** + * arch/xen/drivers/netif/backend/control.c + * + * Routines for interfacing with the control plane. + * + * Copyright (c) 2004, Keir Fraser + */ + +#include "common.h" + +static void netif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id) +{ + switch ( msg->subtype ) + { + case CMSG_NETIF_BE_CREATE: + if ( msg->length != sizeof(netif_be_create_t) ) + goto parse_error; + netif_create((netif_be_create_t *)&msg->msg[0]); + break; + case CMSG_NETIF_BE_DESTROY: + if ( msg->length != sizeof(netif_be_destroy_t) ) + goto parse_error; + netif_destroy((netif_be_destroy_t *)&msg->msg[0]); + break; + case CMSG_NETIF_BE_CONNECT: + if ( msg->length != sizeof(netif_be_connect_t) ) + goto parse_error; + netif_connect((netif_be_connect_t *)&msg->msg[0]); + break; + case CMSG_NETIF_BE_DISCONNECT: + if ( msg->length != sizeof(netif_be_disconnect_t) ) + goto parse_error; + if ( !netif_disconnect((netif_be_disconnect_t *)&msg->msg[0],msg->id) ) + return; /* Sending the response is deferred until later. */ + break; + default: + goto parse_error; + } + + ctrl_if_send_response(msg); + return; + + parse_error: + DPRINTK("Parse error while reading message subtype %d, len %d\n", + msg->subtype, msg->length); + msg->length = 0; + ctrl_if_send_response(msg); +} + +void netif_ctrlif_init(void) +{ + ctrl_msg_t cmsg; + netif_be_driver_status_changed_t st; + + (void)ctrl_if_register_receiver(CMSG_NETIF_BE, netif_ctrlif_rx, + CALLBACK_IN_BLOCKING_CONTEXT); + + /* Send a driver-UP notification to the domain controller. 
*/ + cmsg.type = CMSG_NETIF_BE; + cmsg.subtype = CMSG_NETIF_BE_DRIVER_STATUS_CHANGED; + cmsg.length = sizeof(netif_be_driver_status_changed_t); + st.status = NETIF_DRIVER_STATUS_UP; + memcpy(cmsg.msg, &st, sizeof(st)); + ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE); +} diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/netif/backend/interface.c b/linux-2.4.26-xen-sparse/arch/xen/drivers/netif/backend/interface.c new file mode 100644 index 0000000000..5a2da3d29b --- /dev/null +++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/netif/backend/interface.c @@ -0,0 +1,304 @@ +/****************************************************************************** + * arch/xen/drivers/netif/backend/interface.c + * + * Network-device interface management. + * + * Copyright (c) 2004, Keir Fraser + */ + +#include "common.h" +#include <linux/rtnetlink.h> + +#define NETIF_HASHSZ 1024 +#define NETIF_HASH(_d,_h) (((int)(_d)^(int)(_h))&(NETIF_HASHSZ-1)) + +static netif_t *netif_hash[NETIF_HASHSZ]; +static struct net_device *bridge_dev; +static struct net_bridge *bridge_br; + +netif_t *netif_find_by_handle(domid_t domid, unsigned int handle) +{ + netif_t *netif = netif_hash[NETIF_HASH(domid, handle)]; + while ( (netif != NULL) && + ((netif->domid != domid) || (netif->handle != handle)) ) + netif = netif->hash_next; + return netif; +} + +void __netif_disconnect_complete(netif_t *netif) +{ + ctrl_msg_t cmsg; + netif_be_disconnect_t disc; + + /* + * These can't be done in __netif_disconnect() because at that point there + * may be outstanding requests at the disc whose asynchronous responses + * must still be notified to the remote driver. + */ + unbind_evtchn_from_irq(netif->evtchn); + vfree(netif->tx); /* Frees netif->rx as well. */ + rtnl_lock(); + (void)br_del_if(bridge_br, netif->dev); + (void)dev_close(netif->dev); + rtnl_unlock(); + + /* Construct the deferred response message. */ + cmsg.type = CMSG_NETIF_BE; + cmsg.subtype = CMSG_NETIF_BE_DISCONNECT; + cmsg.id = netif->disconnect_rspid; + cmsg.length = sizeof(netif_be_disconnect_t); + disc.domid = netif->domid; + disc.netif_handle = netif->handle; + disc.status = NETIF_BE_STATUS_OKAY; + memcpy(cmsg.msg, &disc, sizeof(disc)); + + /* + * Make sure message is constructed /before/ status change, because + * after the status change the 'netif' structure could be deallocated at + * any time. Also make sure we send the response /after/ status change, + * as otherwise a subsequent CONNECT request could spuriously fail if + * another CPU doesn't see the status change yet. + */ + mb(); + if ( netif->status != DISCONNECTING ) + BUG(); + netif->status = DISCONNECTED; + mb(); + + /* Send the successful response. 
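 As an aside,
 * lookups of this interface go through netif_find_by_handle() above, a
 * simple chained hash on (domid, handle). The same structure in miniature
 * (illustrative only, not part of the patch):
 */

#include <stdio.h>

#define DEMO_HASHSZ 8                    /* 1024 in the real table */
#define DEMO_HASH(_d,_h) (((unsigned)(_d) ^ (unsigned)(_h)) & (DEMO_HASHSZ-1))

typedef struct demo_netif {
    unsigned domid, handle;
    struct demo_netif *hash_next;        /* chain on collision */
} demo_netif_t;

static demo_netif_t *table[DEMO_HASHSZ];

static demo_netif_t *find(unsigned domid, unsigned handle)
{
    demo_netif_t *n = table[DEMO_HASH(domid, handle)];
    while ( (n != NULL) && ((n->domid != domid) || (n->handle != handle)) )
        n = n->hash_next;
    return n;
}

int main(void)
{
    demo_netif_t a = { 1, 0, NULL };
    table[DEMO_HASH(1, 0)] = &a;
    printf("found: %s\n", (find(1, 0) == &a) ? "yes" : "no");
    return 0;
}

/* Respond only after the status change is globally visible: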
*/ + ctrl_if_send_response(&cmsg); +} + +void netif_create(netif_be_create_t *create) +{ + domid_t domid = create->domid; + unsigned int handle = create->netif_handle; + struct net_device *dev; + netif_t **pnetif, *netif; + char name[IFNAMSIZ]; + + snprintf(name, IFNAMSIZ, "vif%u.%u", domid, handle); + dev = alloc_netdev(sizeof(netif_t), name, ether_setup); + if ( dev == NULL ) + { + DPRINTK("Could not create netif: out of memory\n"); + create->status = NETIF_BE_STATUS_OUT_OF_MEMORY; + return; + } + + netif = dev->priv; + memset(netif, 0, sizeof(*netif)); + netif->domid = domid; + netif->handle = handle; + netif->status = DISCONNECTED; + spin_lock_init(&netif->rx_lock); + spin_lock_init(&netif->tx_lock); + atomic_set(&netif->refcnt, 0); + netif->dev = dev; + + netif->credit_bytes = netif->remaining_credit = ~0UL; + netif->credit_usec = 0UL; + /*init_ac_timer(&new_vif->credit_timeout);*/ + + pnetif = &netif_hash[NETIF_HASH(domid, handle)]; + while ( *pnetif != NULL ) + { + if ( ((*pnetif)->domid == domid) && ((*pnetif)->handle == handle) ) + { + DPRINTK("Could not create netif: already exists\n"); + create->status = NETIF_BE_STATUS_INTERFACE_EXISTS; + kfree(dev); + return; + } + pnetif = &(*pnetif)->hash_next; + } + + dev->hard_start_xmit = netif_be_start_xmit; + dev->get_stats = netif_be_get_stats; + memcpy(dev->dev_addr, create->mac, ETH_ALEN); + + /* Disable queuing. */ + dev->tx_queue_len = 0; + + /* XXX In bridge mode we should force a different MAC from remote end. */ + dev->dev_addr[2] ^= 1; + + if ( register_netdev(dev) != 0 ) + { + DPRINTK("Could not register new net device\n"); + create->status = NETIF_BE_STATUS_OUT_OF_MEMORY; + kfree(dev); + return; + } + + netif->hash_next = *pnetif; + *pnetif = netif; + + DPRINTK("Successfully created netif\n"); + create->status = NETIF_BE_STATUS_OKAY; +} + +void netif_destroy(netif_be_destroy_t *destroy) +{ + domid_t domid = destroy->domid; + unsigned int handle = destroy->netif_handle; + netif_t **pnetif, *netif; + + pnetif = &netif_hash[NETIF_HASH(domid, handle)]; + while ( (netif = *pnetif) != NULL ) + { + if ( (netif->domid == domid) && (netif->handle == handle) ) + { + if ( netif->status != DISCONNECTED ) + goto still_connected; + goto destroy; + } + pnetif = &netif->hash_next; + } + + destroy->status = NETIF_BE_STATUS_INTERFACE_NOT_FOUND; + return; + + still_connected: + destroy->status = NETIF_BE_STATUS_INTERFACE_CONNECTED; + return; + + destroy: + *pnetif = netif->hash_next; + unregister_netdev(netif->dev); + kfree(netif->dev); + destroy->status = NETIF_BE_STATUS_OKAY; +} + +void netif_connect(netif_be_connect_t *connect) +{ + domid_t domid = connect->domid; + unsigned int handle = connect->netif_handle; + unsigned int evtchn = connect->evtchn; + unsigned long tx_shmem_frame = connect->tx_shmem_frame; + unsigned long rx_shmem_frame = connect->rx_shmem_frame; + struct vm_struct *vma; + pgprot_t prot; + int error; + netif_t *netif; + struct net_device *eth0_dev; + + netif = netif_find_by_handle(domid, handle); + if ( unlikely(netif == NULL) ) + { + DPRINTK("netif_connect attempted for non-existent netif (%u,%u)\n", + connect->domid, connect->netif_handle); + connect->status = NETIF_BE_STATUS_INTERFACE_NOT_FOUND; + return; + } + + if ( netif->status != DISCONNECTED ) + { + connect->status = NETIF_BE_STATUS_INTERFACE_CONNECTED; + return; + } + + if ( (vma = get_vm_area(2*PAGE_SIZE, VM_IOREMAP)) == NULL ) + { + connect->status = NETIF_BE_STATUS_OUT_OF_MEMORY; + return; + } + + prot = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | 
_PAGE_ACCESSED);
+    error = direct_remap_area_pages(&init_mm,
+                                    VMALLOC_VMADDR(vma->addr),
+                                    tx_shmem_frame<<PAGE_SHIFT, PAGE_SIZE,
+                                    prot, domid);
+    error |= direct_remap_area_pages(&init_mm,
+                                     VMALLOC_VMADDR(vma->addr) + PAGE_SIZE,
+                                     rx_shmem_frame<<PAGE_SHIFT, PAGE_SIZE,
+                                     prot, domid);
+    if ( error != 0 )
+    {
+        if ( error == -ENOMEM )
+            connect->status = NETIF_BE_STATUS_OUT_OF_MEMORY;
+        else if ( error == -EFAULT )
+            connect->status = NETIF_BE_STATUS_MAPPING_ERROR;
+        else
+            connect->status = NETIF_BE_STATUS_ERROR;
+        vfree(vma->addr);
+        return;
+    }
+
+    netif->evtchn         = evtchn;
+    netif->irq            = bind_evtchn_to_irq(evtchn);
+    netif->tx_shmem_frame = tx_shmem_frame;
+    netif->rx_shmem_frame = rx_shmem_frame;
+    netif->tx             = (netif_tx_interface_t *)vma->addr;
+    netif->rx             = (netif_rx_interface_t *)((char *)vma->addr + PAGE_SIZE);
+    netif->status         = CONNECTED;
+    netif_get(netif);
+
+    rtnl_lock();
+
+    (void)dev_open(netif->dev);
+    (void)br_add_if(bridge_br, netif->dev);
+
+    /*
+     * The default config is a very simple binding to eth0.
+     * If eth0 is being used as an IP interface by this OS then someone
+     * must add eth0's IP address to nbe-br, and change the routing table
+     * to refer to nbe-br instead of eth0.
+     */
+    (void)dev_open(bridge_dev);
+    if ( (eth0_dev = __dev_get_by_name("eth0")) != NULL )
+    {
+        (void)dev_open(eth0_dev);
+        (void)br_add_if(bridge_br, eth0_dev);
+    }
+
+    rtnl_unlock();
+
+    (void)request_irq(netif->irq, netif_be_int, 0, netif->dev->name, netif);
+    netif_start_queue(netif->dev);
+
+    connect->status = NETIF_BE_STATUS_OKAY;
+}
+
+int netif_disconnect(netif_be_disconnect_t *disconnect, u8 rsp_id)
+{
+    domid_t       domid  = disconnect->domid;
+    unsigned int  handle = disconnect->netif_handle;
+    netif_t      *netif;
+
+    netif = netif_find_by_handle(domid, handle);
+    if ( unlikely(netif == NULL) )
+    {
+        DPRINTK("netif_disconnect attempted for non-existent netif"
+                " (%u,%u)\n", disconnect->domid, disconnect->netif_handle);
+        disconnect->status = NETIF_BE_STATUS_INTERFACE_NOT_FOUND;
+        return 1; /* Caller will send response error message. */
+    }
+
+    if ( netif->status == CONNECTED )
+    {
+        netif->status = DISCONNECTING;
+        netif->disconnect_rspid = rsp_id;
+        wmb(); /* Let other CPUs see the status change. */
+        netif_stop_queue(netif->dev);
+        free_irq(netif->irq, NULL);
+        netif_deschedule(netif);
+        netif_put(netif);
+    }
+
+    return 0; /* Caller should not send response message. */
+}
+
+void netif_interface_init(void)
+{
+    memset(netif_hash, 0, sizeof(netif_hash));
+    if ( br_add_bridge("nbe-br") != 0 )
+        BUG();
+    bridge_dev = __dev_get_by_name("nbe-br");
+    bridge_br  = (struct net_bridge *)bridge_dev->priv;
+    bridge_br->bridge_hello_time    = bridge_br->hello_time    = 0;
+    bridge_br->bridge_forward_delay = bridge_br->forward_delay = 0;
+    bridge_br->stp_enabled          = 0;
+}
diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/netif/backend/main.c b/linux-2.4.26-xen-sparse/arch/xen/drivers/netif/backend/main.c
new file mode 100644
index 0000000000..2c4e5cb211
--- /dev/null
+++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/netif/backend/main.c
@@ -0,0 +1,772 @@
+/******************************************************************************
+ * arch/xen/drivers/netif/backend/main.c
+ *
+ * Back-end of the driver for virtual network devices. This portion of the
+ * driver exports a 'unified' network-device interface that can be accessed
+ * by any operating system that implements a compatible front end.
A + * reference front-end implementation can be found in: + * arch/xen/drivers/netif/frontend + * + * Copyright (c) 2002-2004, K A Fraser + */ + +#include "common.h" + +static void netif_page_release(struct page *page); +static void make_tx_response(netif_t *netif, + u16 id, + s8 st); +static int make_rx_response(netif_t *netif, + u16 id, + s8 st, + memory_t addr, + u16 size); + +static void net_tx_action(unsigned long unused); +static DECLARE_TASKLET(net_tx_tasklet, net_tx_action, 0); + +static void net_rx_action(unsigned long unused); +static DECLARE_TASKLET(net_rx_tasklet, net_rx_action, 0); + +typedef struct { + u16 id; + unsigned long old_mach_ptr; + unsigned long new_mach_pfn; + netif_t *netif; +} rx_info_t; +static struct sk_buff_head rx_queue; +static multicall_entry_t rx_mcl[NETIF_RX_RING_SIZE*2]; +static mmu_update_t rx_mmu[NETIF_RX_RING_SIZE*3]; +static unsigned char rx_notify[NR_EVENT_CHANNELS]; + +/* Don't currently gate addition of an interface to the tx scheduling list. */ +#define tx_work_exists(_if) (1) + +#define MAX_PENDING_REQS 256 +static unsigned long mmap_vstart; +#define MMAP_VADDR(_req) (mmap_vstart + ((_req) * PAGE_SIZE)) + +#define PKT_PROT_LEN (ETH_HLEN + 20) + +static u16 pending_id[MAX_PENDING_REQS]; +static netif_t *pending_netif[MAX_PENDING_REQS]; +static u16 pending_ring[MAX_PENDING_REQS]; +typedef unsigned int PEND_RING_IDX; +#define MASK_PEND_IDX(_i) ((_i)&(MAX_PENDING_REQS-1)) +static PEND_RING_IDX pending_prod, pending_cons; +#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons) + +/* Freed TX SKBs get batched on this ring before return to pending_ring. */ +static u16 dealloc_ring[MAX_PENDING_REQS]; +static spinlock_t dealloc_lock = SPIN_LOCK_UNLOCKED; +static PEND_RING_IDX dealloc_prod, dealloc_cons; + +typedef struct { + u16 idx; + netif_tx_request_t req; + netif_t *netif; +} tx_info_t; +static struct sk_buff_head tx_queue; +static multicall_entry_t tx_mcl[MAX_PENDING_REQS]; + +static struct list_head net_schedule_list; +static spinlock_t net_schedule_list_lock; + +#define MAX_MFN_ALLOC 64 +static unsigned long mfn_list[MAX_MFN_ALLOC]; +static unsigned int alloc_index = 0; +static spinlock_t mfn_lock = SPIN_LOCK_UNLOCKED; + +static void __refresh_mfn_list(void) +{ + int ret = HYPERVISOR_dom_mem_op(MEMOP_increase_reservation, + mfn_list, MAX_MFN_ALLOC); + if ( unlikely(ret != MAX_MFN_ALLOC) ) + { + printk(KERN_ALERT "Unable to increase memory reservation (%d)\n", ret); + BUG(); + } + alloc_index = MAX_MFN_ALLOC; +} + +static unsigned long get_new_mfn(void) +{ + unsigned long mfn, flags; + spin_lock_irqsave(&mfn_lock, flags); + if ( alloc_index == 0 ) + __refresh_mfn_list(); + mfn = mfn_list[--alloc_index]; + spin_unlock_irqrestore(&mfn_lock, flags); + return mfn; +} + +static void dealloc_mfn(unsigned long mfn) +{ + unsigned long flags; + spin_lock_irqsave(&mfn_lock, flags); + if ( alloc_index != MAX_MFN_ALLOC ) + mfn_list[alloc_index++] = mfn; + else + (void)HYPERVISOR_dom_mem_op(MEMOP_decrease_reservation, &mfn, 1); + spin_unlock_irqrestore(&mfn_lock, flags); +} + +static inline void maybe_schedule_tx_action(void) +{ + smp_mb(); + if ( (NR_PENDING_REQS < (MAX_PENDING_REQS/2)) && + !list_empty(&net_schedule_list) ) + tasklet_schedule(&net_tx_tasklet); +} + +/* + * This is the primary RECEIVE function for a network interface. + * Note that, from the p.o.v. of /this/ OS it looks like a transmit. 
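The pending-ring bookkeeping above (MASK_PEND_IDX, NR_PENDING_REQS) relies on free-running indexes: the producer and consumer counters are only reduced modulo the ring size at the moment of an array access, so occupancy is plain subtraction and keeps working across 2^32 wrap-around. A tiny standalone illustration of the arithmetic (the ring size must be a power of two for the mask to be valid):

    #include <stdio.h>

    #define RING_SIZE 256u                    /* must be a power of two */
    #define MASK(i)   ((i) & (RING_SIZE - 1)) /* counter -> array slot  */

    int main(void)
    {
        unsigned int prod = 260, cons = 250;  /* free-running counters  */
        printf("entries in ring: %u\n", prod - cons);            /* 10  */
        printf("slot for 260   : %u\n", MASK(prod));             /* 4   */
        printf("ring full?     : %d\n", (prod - cons) == RING_SIZE);
        return 0;
    }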
+ */
+int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+    netif_t *netif = (netif_t *)dev->priv;
+
+    /* Drop the packet if the target domain has no receive buffers. */
+    if ( (netif->rx_req_cons == netif->rx->req_prod) ||
+         ((netif->rx_req_cons-netif->rx_resp_prod) == NETIF_RX_RING_SIZE) )
+        goto drop;
+
+    /*
+     * We do not copy the packet unless:
+     *  1. The data is shared; or
+     *  2. It spans a page boundary; or
+     *  3. We cannot be sure the whole data page is allocated.
+     * The copying method is taken from skb_copy().
+     * NB. We also couldn't cope with fragmented packets, but we won't get
+     * any because we do not advertise the NETIF_F_SG feature.
+     */
+    if ( skb_shared(skb) || skb_cloned(skb) ||
+         (((unsigned long)skb->end ^ (unsigned long)skb->head) & PAGE_MASK) ||
+         ((skb->end - skb->head) < (PAGE_SIZE/2)) )
+    {
+        struct sk_buff *nskb = alloc_skb(PAGE_SIZE-1024, GFP_ATOMIC);
+        int hlen = skb->data - skb->head;
+        if ( unlikely(nskb == NULL) )
+            goto drop;
+        skb_reserve(nskb, hlen);
+        __skb_put(nskb, skb->len);
+        (void)skb_copy_bits(skb, -hlen, nskb->head, hlen + skb->len);
+        dev_kfree_skb(skb);
+        skb = nskb;
+    }
+
+    ((rx_info_t *)&skb->cb[0])->id =
+        netif->rx->ring[MASK_NETIF_RX_IDX(netif->rx_req_cons++)].req.id;
+    ((rx_info_t *)&skb->cb[0])->netif = netif;
+
+    __skb_queue_tail(&rx_queue, skb);
+    tasklet_schedule(&net_rx_tasklet);
+
+    return 0;
+
+ drop:
+    netif->stats.rx_dropped++;
+    dev_kfree_skb(skb);
+    return 0;
+}
+
+#if 0
+static void xen_network_done_notify(void)
+{
+    static struct net_device *eth0_dev = NULL;
+    if ( unlikely(eth0_dev == NULL) )
+        eth0_dev = __dev_get_by_name("eth0");
+    netif_rx_schedule(eth0_dev);
+}
+/*
+ * Add following to poll() function in NAPI driver (Tigon3 is example):
+ *  if ( xen_network_done() )
+ *      tg3_enable_ints(tp);
+ */
+int xen_network_done(void)
+{
+    return skb_queue_empty(&rx_queue);
+}
+#endif
+
+static void net_rx_action(unsigned long unused)
+{
+    netif_t *netif;
+    s8 status;
+    u16 size, id, evtchn;
+    mmu_update_t *mmu = rx_mmu;
+    multicall_entry_t *mcl;
+    unsigned long vdata, mdata, new_mfn;
+    struct sk_buff_head rxq;
+    struct sk_buff *skb;
+    u16 notify_list[NETIF_RX_RING_SIZE];
+    int notify_nr = 0;
+
+    skb_queue_head_init(&rxq);
+
+    mcl = rx_mcl;
+    while ( (skb = __skb_dequeue(&rx_queue)) != NULL )
+    {
+        netif   = ((rx_info_t *)&skb->cb[0])->netif;
+        vdata   = (unsigned long)skb->data;
+        mdata   = virt_to_machine(vdata);
+        new_mfn = get_new_mfn();
+
+        mmu[0].ptr  = (new_mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE;
+        mmu[0].val  = __pa(vdata) >> PAGE_SHIFT;
+        mmu[1].val  = (unsigned long)(netif->domid<<16) & ~0xFFFFUL;
+        mmu[1].ptr  = (unsigned long)(netif->domid<< 0) & ~0xFFFFUL;
+        mmu[1].ptr |= MMU_EXTENDED_COMMAND;
+        mmu[1].val |= MMUEXT_SET_SUBJECTDOM;
+        mmu[2].ptr  = (mdata & PAGE_MASK) | MMU_EXTENDED_COMMAND;
+        mmu[2].val  = MMUEXT_REASSIGN_PAGE;
+
+        mcl[0].op      = __HYPERVISOR_update_va_mapping;
+        mcl[0].args[0] = vdata >> PAGE_SHIFT;
+        mcl[0].args[1] = (new_mfn << PAGE_SHIFT) | __PAGE_KERNEL;
+        mcl[0].args[2] = 0;
+        mcl[1].op      = __HYPERVISOR_mmu_update;
+        mcl[1].args[0] = (unsigned long)mmu;
+        mcl[1].args[1] = 3;
+        mcl[1].args[2] = 0;
+
+        mmu += 3;
+        mcl += 2;
+
+        ((rx_info_t *)&skb->cb[0])->old_mach_ptr = mdata;
+        ((rx_info_t *)&skb->cb[0])->new_mach_pfn = new_mfn;
+        __skb_queue_tail(&rxq, skb);
+
+        /* Filled the batch queue?
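A few screens up, netif_be_start_xmit parks per-packet state in skb->cb[] through the rx_info_t overlay; cb[] is the 48-byte scratch area that belongs to whichever layer currently holds the skb. A minimal sketch of the overlay pattern, with illustrative field names rather than the driver's own:

    #include <assert.h>
    #include <string.h>

    struct fake_skb { char cb[48]; };   /* Linux skbs carry a cb[48] scratch area */
    struct rx_info  { unsigned short id; void *owner; };

    static void stamp(struct fake_skb *skb, unsigned short id, void *owner)
    {
        /* The overlay must fit inside cb[]; worth asserting in real code too. */
        assert(sizeof(struct rx_info) <= sizeof(skb->cb));
        ((struct rx_info *)&skb->cb[0])->id    = id;
        ((struct rx_info *)&skb->cb[0])->owner = owner;
    }

    int main(void)
    {
        struct fake_skb skb;
        memset(&skb, 0, sizeof(skb));
        stamp(&skb, 7, 0);
        return ((struct rx_info *)&skb.cb[0])->id == 7 ? 0 : 1;
    }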
*/ + if ( (mcl - rx_mcl) == ARRAY_SIZE(rx_mcl) ) + break; + } + + if ( mcl == rx_mcl ) + return; + + mcl[-2].args[2] = UVMF_FLUSH_TLB; + (void)HYPERVISOR_multicall(rx_mcl, mcl - rx_mcl); + + mcl = rx_mcl; + while ( (skb = __skb_dequeue(&rxq)) != NULL ) + { + netif = ((rx_info_t *)&skb->cb[0])->netif; + size = skb->tail - skb->data; + id = ((rx_info_t *)&skb->cb[0])->id; + new_mfn = ((rx_info_t *)&skb->cb[0])->new_mach_pfn; + mdata = ((rx_info_t *)&skb->cb[0])->old_mach_ptr; + + /* Check the reassignment error code. */ + if ( unlikely(mcl[1].args[5] != 0) ) + { + DPRINTK("Failed MMU update transferring to DOM%u\n", + netif->domid); + (void)HYPERVISOR_update_va_mapping( + (unsigned long)skb->head >> PAGE_SHIFT, + (pte_t) { (mdata & PAGE_MASK) | __PAGE_KERNEL }, + UVMF_INVLPG); + dealloc_mfn(new_mfn); + status = NETIF_RSP_ERROR; + } + else + { + phys_to_machine_mapping[__pa(skb->data) >> PAGE_SHIFT] = new_mfn; + + atomic_set(&(skb_shinfo(skb)->dataref), 1); + skb_shinfo(skb)->nr_frags = 0; + skb_shinfo(skb)->frag_list = NULL; + + netif->stats.rx_bytes += size; + netif->stats.rx_packets++; + + status = NETIF_RSP_OKAY; + } + + evtchn = netif->evtchn; + if ( make_rx_response(netif, id, status, mdata, size) && + (rx_notify[evtchn] == 0) ) + { + rx_notify[evtchn] = 1; + notify_list[notify_nr++] = evtchn; + } + + dev_kfree_skb(skb); + + mcl += 2; + } + + while ( notify_nr != 0 ) + { + evtchn = notify_list[--notify_nr]; + rx_notify[evtchn] = 0; + notify_via_evtchn(evtchn); + } + + /* More work to do? */ + if ( !skb_queue_empty(&rx_queue) ) + tasklet_schedule(&net_rx_tasklet); +#if 0 + else + xen_network_done_notify(); +#endif +} + +struct net_device_stats *netif_be_get_stats(struct net_device *dev) +{ + netif_t *netif = dev->priv; + return &netif->stats; +} + +static int __on_net_schedule_list(netif_t *netif) +{ + return netif->list.next != NULL; +} + +static void remove_from_net_schedule_list(netif_t *netif) +{ + spin_lock_irq(&net_schedule_list_lock); + if ( likely(__on_net_schedule_list(netif)) ) + { + list_del(&netif->list); + netif->list.next = NULL; + netif_put(netif); + } + spin_unlock_irq(&net_schedule_list_lock); +} + +static void add_to_net_schedule_list_tail(netif_t *netif) +{ + if ( __on_net_schedule_list(netif) ) + return; + + spin_lock_irq(&net_schedule_list_lock); + if ( !__on_net_schedule_list(netif) && (netif->status == CONNECTED) ) + { + list_add_tail(&netif->list, &net_schedule_list); + netif_get(netif); + } + spin_unlock_irq(&net_schedule_list_lock); +} + +static inline void netif_schedule_work(netif_t *netif) +{ + if ( (netif->tx_req_cons != netif->tx->req_prod) && + ((netif->tx_req_cons-netif->tx_resp_prod) != NETIF_TX_RING_SIZE) ) + { + add_to_net_schedule_list_tail(netif); + maybe_schedule_tx_action(); + } +} + +void netif_deschedule(netif_t *netif) +{ + remove_from_net_schedule_list(netif); +} + +#if 0 +static void tx_credit_callback(unsigned long data) +{ + netif_t *netif = (netif_t *)data; + netif->remaining_credit = netif->credit_bytes; + netif_schedule_work(netif); +} +#endif + +static void net_tx_action(unsigned long unused) +{ + struct list_head *ent; + struct sk_buff *skb; + netif_t *netif; + netif_tx_request_t txreq; + u16 pending_idx; + NETIF_RING_IDX i; + struct page *page; + multicall_entry_t *mcl; + + if ( (i = dealloc_cons) == dealloc_prod ) + goto skip_dealloc; + + mcl = tx_mcl; + while ( i != dealloc_prod ) + { + pending_idx = dealloc_ring[MASK_PEND_IDX(i++)]; + mcl[0].op = __HYPERVISOR_update_va_mapping; + mcl[0].args[0] = MMAP_VADDR(pending_idx) >> 
PAGE_SHIFT; + mcl[0].args[1] = 0; + mcl[0].args[2] = 0; + mcl++; + } + + mcl[-1].args[2] = UVMF_FLUSH_TLB; + (void)HYPERVISOR_multicall(tx_mcl, mcl - tx_mcl); + + while ( dealloc_cons != dealloc_prod ) + { + pending_idx = dealloc_ring[MASK_PEND_IDX(dealloc_cons++)]; + + netif = pending_netif[pending_idx]; + + spin_lock(&netif->tx_lock); + make_tx_response(netif, pending_id[pending_idx], NETIF_RSP_OKAY); + spin_unlock(&netif->tx_lock); + + pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx; + + /* + * Scheduling checks must happen after the above response is posted. + * This avoids a possible race with a guest OS on another CPU. + */ + mb(); + if ( (netif->tx_req_cons != netif->tx->req_prod) && + ((netif->tx_req_cons-netif->tx_resp_prod) != NETIF_TX_RING_SIZE) ) + add_to_net_schedule_list_tail(netif); + + netif_put(netif); + } + + skip_dealloc: + mcl = tx_mcl; + while ( (NR_PENDING_REQS < MAX_PENDING_REQS) && + !list_empty(&net_schedule_list) ) + { + /* Get a netif from the list with work to do. */ + ent = net_schedule_list.next; + netif = list_entry(ent, netif_t, list); + netif_get(netif); + remove_from_net_schedule_list(netif); + + /* Work to do? */ + i = netif->tx_req_cons; + if ( (i == netif->tx->req_prod) || + ((i-netif->tx_resp_prod) == NETIF_TX_RING_SIZE) ) + { + netif_put(netif); + continue; + } + memcpy(&txreq, &netif->tx->ring[MASK_NETIF_TX_IDX(i)].req, + sizeof(txreq)); + netif->tx_req_cons++; + +#if 0 + /* Credit-based scheduling. */ + if ( tx.size > netif->remaining_credit ) + { + s_time_t now = NOW(), next_credit = + netif->credit_timeout.expires + MICROSECS(netif->credit_usec); + if ( next_credit <= now ) + { + netif->credit_timeout.expires = now; + netif->remaining_credit = netif->credit_bytes; + } + else + { + netif->remaining_credit = 0; + netif->credit_timeout.expires = next_credit; + netif->credit_timeout.data = (unsigned long)netif; + netif->credit_timeout.function = tx_credit_callback; + netif->credit_timeout.cpu = smp_processor_id(); + add_ac_timer(&netif->credit_timeout); + break; + } + } + netif->remaining_credit -= tx.size; +#endif + + netif_schedule_work(netif); + + if ( unlikely(txreq.size <= PKT_PROT_LEN) || + unlikely(txreq.size > ETH_FRAME_LEN) ) + { + DPRINTK("Bad packet size: %d\n", txreq.size); + make_tx_response(netif, txreq.id, NETIF_RSP_ERROR); + netif_put(netif); + continue; + } + + /* No crossing a page boundary as the payload mustn't fragment. */ + if ( unlikely(((txreq.addr & ~PAGE_MASK) + txreq.size) >= PAGE_SIZE) ) + { + DPRINTK("txreq.addr: %lx, size: %u, end: %lu\n", + txreq.addr, txreq.size, + (txreq.addr &~PAGE_MASK) + txreq.size); + make_tx_response(netif, txreq.id, NETIF_RSP_ERROR); + netif_put(netif); + continue; + } + + pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)]; + + if ( unlikely((skb = alloc_skb(PKT_PROT_LEN, GFP_ATOMIC)) == NULL) ) + { + DPRINTK("Can't allocate a skb in start_xmit.\n"); + make_tx_response(netif, txreq.id, NETIF_RSP_ERROR); + netif_put(netif); + break; + } + + mcl[0].op = __HYPERVISOR_update_va_mapping_otherdomain; + mcl[0].args[0] = MMAP_VADDR(pending_idx) >> PAGE_SHIFT; + mcl[0].args[1] = (txreq.addr & PAGE_MASK) | __PAGE_KERNEL; + mcl[0].args[2] = 0; + mcl[0].args[3] = netif->domid; + mcl++; + + ((tx_info_t *)&skb->cb[0])->idx = pending_idx; + ((tx_info_t *)&skb->cb[0])->netif = netif; + memcpy(&((tx_info_t *)&skb->cb[0])->req, &txreq, sizeof(txreq)); + __skb_queue_tail(&tx_queue, skb); + + pending_cons++; + + /* Filled the batch queue? 
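The loop being closed here shows the section's recurring gather/submit/inspect shape: fill an array of call descriptors while draining a ring, issue them in a single batched call, then make a second pass over the descriptors for the per-entry results. A generic standalone sketch of that shape, where batch_submit is a hypothetical stand-in for HYPERVISOR_multicall:

    #include <stdio.h>

    #define BATCH 8
    struct call { int arg; int result; };

    /* Hypothetical stand-in for a batched hypercall. */
    static void batch_submit(struct call *c, int n)
    {
        for (int i = 0; i < n; i++)
            c[i].result = (c[i].arg >= 0) ? 0 : -1;
    }

    int main(void)
    {
        struct call batch[BATCH];
        int n = 0;
        for (int work = -2; work < 3 && n < BATCH; work++)
            batch[n++].arg = work;          /* pass 1: gather            */
        batch_submit(batch, n);             /* one submission for all    */
        for (int i = 0; i < n; i++)         /* pass 2: per-entry results */
            if (batch[i].result != 0)
                printf("entry %d failed\n", i);
        return 0;
    }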
*/
+        if ( (mcl - tx_mcl) == ARRAY_SIZE(tx_mcl) )
+            break;
+    }
+
+    if ( mcl == tx_mcl )
+        return;
+
+    (void)HYPERVISOR_multicall(tx_mcl, mcl - tx_mcl);
+
+    mcl = tx_mcl;
+    while ( (skb = __skb_dequeue(&tx_queue)) != NULL )
+    {
+        pending_idx = ((tx_info_t *)&skb->cb[0])->idx;
+        netif       = ((tx_info_t *)&skb->cb[0])->netif;
+        memcpy(&txreq, &((tx_info_t *)&skb->cb[0])->req, sizeof(txreq));
+
+        /* Check the remap error code. */
+        if ( unlikely(mcl[0].args[5] != 0) )
+        {
+            DPRINTK("Bad page frame\n");
+            make_tx_response(netif, txreq.id, NETIF_RSP_ERROR);
+            netif_put(netif);
+            kfree_skb(skb);
+            mcl++;
+            pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
+            continue;
+        }
+
+        phys_to_machine_mapping[__pa(MMAP_VADDR(pending_idx)) >> PAGE_SHIFT] =
+            txreq.addr >> PAGE_SHIFT;
+
+        __skb_put(skb, PKT_PROT_LEN);
+        memcpy(skb->data,
+               (void *)(MMAP_VADDR(pending_idx)|(txreq.addr&~PAGE_MASK)),
+               PKT_PROT_LEN);
+
+        page = virt_to_page(MMAP_VADDR(pending_idx));
+
+        /* Append the packet payload as a fragment. */
+        skb_shinfo(skb)->frags[0].page        = page;
+        skb_shinfo(skb)->frags[0].size        = txreq.size - PKT_PROT_LEN;
+        skb_shinfo(skb)->frags[0].page_offset =
+            (txreq.addr + PKT_PROT_LEN) & ~PAGE_MASK;
+        skb_shinfo(skb)->nr_frags = 1;
+        skb->data_len  = txreq.size - PKT_PROT_LEN;
+        skb->len      += skb->data_len;
+
+        skb->dev      = netif->dev;
+        skb->protocol = eth_type_trans(skb, skb->dev);
+
+        /*
+         * Destructor information. We hideously abuse the 'mapping' pointer,
+         * which isn't otherwise used by us. The page deallocator is modified
+         * to interpret a non-NULL value as a destructor function to be called.
+         * This works okay because in all other cases the pointer must be NULL
+         * when the page is freed (normally Linux will explicitly bug out if
+         * it sees otherwise).
+         */
+        page->mapping = (struct address_space *)netif_page_release;
+        atomic_set(&page->count, 1);
+        pending_id[pending_idx]    = txreq.id;
+        pending_netif[pending_idx] = netif;
+
+        netif->stats.tx_bytes += txreq.size;
+        netif->stats.tx_packets++;
+
+        netif_rx(skb);
+        netif->dev->last_rx = jiffies;
+
+        mcl++;
+    }
+}
+
+static void netif_page_release(struct page *page)
+{
+    unsigned long flags;
+    u16 pending_idx = page - virt_to_page(mmap_vstart);
+
+    /* Stop the abuse. */
+    page->mapping = NULL;
+
+    spin_lock_irqsave(&dealloc_lock, flags);
+    dealloc_ring[MASK_PEND_IDX(dealloc_prod++)] = pending_idx;
+    spin_unlock_irqrestore(&dealloc_lock, flags);
+
+    tasklet_schedule(&net_tx_tasklet);
+}
+
+#if 0
+long flush_bufs_for_netif(netif_t *netif)
+{
+    NET_RING_IDX i;
+
+    /* Return any outstanding receive buffers to the guest OS. */
+    spin_lock(&netif->rx_lock);
+    for ( i = netif->rx_req_cons;
+          (i != netif->rx->req_prod) &&
+              ((i-netif->rx_resp_prod) != NETIF_RX_RING_SIZE);
+          i++ )
+    {
+        make_rx_response(netif,
+                         netif->rx->ring[MASK_NETIF_RX_IDX(i)].req.id,
+                         NETIF_RSP_DROPPED, 0, 0);
+    }
+    netif->rx_req_cons = i;
+    spin_unlock(&netif->rx_lock);
+
+    /*
+     * Flush pending transmit buffers. The guest may still have to wait for
+     * buffers that are queued at a physical NIC.
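The 'mapping' overload described in net_tx_action above amounts to a per-page destructor hook: a pointer field known to be NULL on every other free path is hijacked to carry a callback. A sketch of the same idea with an explicit hook field, all names hypothetical:

    #include <stdio.h>

    struct page_like {
        void (*release)(struct page_like *);  /* NULL = normal free path */
    };

    static void my_release(struct page_like *p)
    {
        p->release = 0;                       /* "stop the abuse"        */
        printf("deferred work queued for page %p\n", (void *)p);
    }

    static void free_page_like(struct page_like *p)
    {
        /* The allocator checks the hook before truly freeing. */
        if (p->release) { p->release(p); return; }
        /* ... normal free path ... */
    }

    int main(void)
    {
        struct page_like pg = { my_release };
        free_page_like(&pg);
        return 0;
    }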
+ */ + spin_lock(&netif->tx_lock); + for ( i = netif->tx_req_cons; + (i != netif->tx->req_prod) && + ((i-netif->tx_resp_prod) != NETIF_TX_RING_SIZE); + i++ ) + { + make_tx_response(netif, + netif->tx->ring[MASK_NETIF_TX_IDX(i)].req.id, + NETIF_RSP_DROPPED); + } + netif->tx_req_cons = i; + spin_unlock(&netif->tx_lock); + + return 0; +} +#endif + +void netif_be_int(int irq, void *dev_id, struct pt_regs *regs) +{ + netif_t *netif = dev_id; + if ( tx_work_exists(netif) ) + { + add_to_net_schedule_list_tail(netif); + maybe_schedule_tx_action(); + } +} + +static void make_tx_response(netif_t *netif, + u16 id, + s8 st) +{ + NET_RING_IDX i = netif->tx_resp_prod; + netif_tx_response_t *resp; + + resp = &netif->tx->ring[MASK_NETIF_TX_IDX(i)].resp; + resp->id = id; + resp->status = st; + wmb(); + netif->tx->resp_prod = netif->tx_resp_prod = ++i; + + mb(); /* Update producer before checking event threshold. */ + if ( i == netif->tx->event ) + notify_via_evtchn(netif->evtchn); +} + +static int make_rx_response(netif_t *netif, + u16 id, + s8 st, + memory_t addr, + u16 size) +{ + NET_RING_IDX i = netif->rx_resp_prod; + netif_rx_response_t *resp; + + resp = &netif->rx->ring[MASK_NETIF_RX_IDX(i)].resp; + resp->addr = addr; + resp->id = id; + resp->status = (s16)size; + if ( st < 0 ) + resp->status = (s16)st; + wmb(); + netif->rx->resp_prod = netif->rx_resp_prod = ++i; + + mb(); /* Update producer before checking event threshold. */ + return (i == netif->rx->event); +} + +static void netif_be_dbg(int irq, void *dev_id, struct pt_regs *regs) +{ + struct list_head *ent; + netif_t *netif; + int i = 0; + + printk(KERN_ALERT "netif_schedule_list:\n"); + spin_lock_irq(&net_schedule_list_lock); + + list_for_each ( ent, &net_schedule_list ) + { + netif = list_entry(ent, netif_t, list); + printk(KERN_ALERT " %d: private(rx_req_cons=%08x rx_resp_prod=%08x\n", + i, netif->rx_req_cons, netif->rx_resp_prod); + printk(KERN_ALERT " tx_req_cons=%08x tx_resp_prod=%08x)\n", + netif->tx_req_cons, netif->tx_resp_prod); + printk(KERN_ALERT " shared(rx_req_prod=%08x rx_resp_prod=%08x\n", + netif->rx->req_prod, netif->rx->resp_prod); + printk(KERN_ALERT " rx_event=%08x tx_req_prod=%08x\n", + netif->rx->event, netif->tx->req_prod); + printk(KERN_ALERT " tx_resp_prod=%08x, tx_event=%08x)\n", + netif->tx->resp_prod, netif->tx->event); + i++; + } + + spin_unlock_irq(&net_schedule_list_lock); + printk(KERN_ALERT " ** End of netif_schedule_list **\n"); +} + +static int __init init_module(void) +{ + int i; + + if ( !(start_info.flags & SIF_NET_BE_DOMAIN) && + !(start_info.flags & SIF_INITDOMAIN) ) + return 0; + + printk("Initialising Xen netif backend\n"); + + skb_queue_head_init(&rx_queue); + skb_queue_head_init(&tx_queue); + + netif_interface_init(); + + if ( (mmap_vstart = allocate_empty_lowmem_region(MAX_PENDING_REQS)) == 0 ) + BUG(); + + pending_cons = 0; + pending_prod = MAX_PENDING_REQS; + for ( i = 0; i < MAX_PENDING_REQS; i++ ) + pending_ring[i] = i; + + spin_lock_init(&net_schedule_list_lock); + INIT_LIST_HEAD(&net_schedule_list); + + netif_ctrlif_init(); + + (void)request_irq(bind_virq_to_irq(VIRQ_DEBUG), + netif_be_dbg, SA_SHIRQ, + "net-be-dbg", &netif_be_dbg); + + return 0; +} + +static void cleanup_module(void) +{ + BUG(); +} + +module_init(init_module); +module_exit(cleanup_module); diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/netif/frontend/Makefile b/linux-2.4.26-xen-sparse/arch/xen/drivers/netif/frontend/Makefile new file mode 100644 index 0000000000..032d02d7cc --- /dev/null +++ 
b/linux-2.4.26-xen-sparse/arch/xen/drivers/netif/frontend/Makefile @@ -0,0 +1,3 @@ +O_TARGET := drv.o +obj-y := main.o +include $(TOPDIR)/Rules.make diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/netif/frontend/main.c b/linux-2.4.26-xen-sparse/arch/xen/drivers/netif/frontend/main.c new file mode 100644 index 0000000000..4d4c579703 --- /dev/null +++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/netif/frontend/main.c @@ -0,0 +1,777 @@ +/****************************************************************************** + * arch/xen/drivers/netif/frontend/main.c + * + * Virtual network driver for XenoLinux. + * + * Copyright (c) 2002-2004, K A Fraser + */ + +#include <linux/config.h> +#include <linux/module.h> + +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/errno.h> + +#include <linux/netdevice.h> +#include <linux/inetdevice.h> +#include <linux/etherdevice.h> +#include <linux/skbuff.h> +#include <linux/init.h> + +#include <asm/io.h> +#include <net/sock.h> +#include <net/pkt_sched.h> + +#include <asm/evtchn.h> +#include <asm/ctrl_if.h> + +#include <asm/page.h> + +#include "../netif.h" + +#define RX_BUF_SIZE ((PAGE_SIZE/2)+1) /* Fool the slab allocator :-) */ + +static void network_tx_buf_gc(struct net_device *dev); +static void network_alloc_rx_buffers(struct net_device *dev); +static void cleanup_module(void); + +static unsigned long rx_pfn_array[NETIF_RX_RING_SIZE]; +static multicall_entry_t rx_mcl[NETIF_RX_RING_SIZE+1]; +static mmu_update_t rx_mmu[NETIF_RX_RING_SIZE]; + +static struct list_head dev_list; + +struct net_private +{ + struct list_head list; + struct net_device *dev; + + struct net_device_stats stats; + NETIF_RING_IDX rx_resp_cons, tx_resp_cons; + unsigned int tx_full; + + netif_tx_interface_t *tx; + netif_rx_interface_t *rx; + + spinlock_t tx_lock; + spinlock_t rx_lock; + + unsigned int handle; + unsigned int evtchn; + unsigned int irq; + +#define NETIF_STATE_CLOSED 0 +#define NETIF_STATE_DISCONNECTED 1 +#define NETIF_STATE_CONNECTED 2 +#define NETIF_STATE_ACTIVE 3 + unsigned int state; + + /* + * {tx,rx}_skbs store outstanding skbuffs. The first entry in each + * array is an index into a chain of free entries. + */ + struct sk_buff *tx_skbs[NETIF_TX_RING_SIZE+1]; + struct sk_buff *rx_skbs[NETIF_RX_RING_SIZE+1]; +}; + +/* Access macros for acquiring freeing slots in {tx,rx}_skbs[]. */ +#define ADD_ID_TO_FREELIST(_list, _id) \ + (_list)[(_id)] = (_list)[0]; \ + (_list)[0] = (void *)(unsigned long)(_id); +#define GET_ID_FROM_FREELIST(_list) \ + ({ unsigned long _id = (unsigned long)(_list)[0]; \ + (_list)[0] = (_list)[_id]; \ + (unsigned short)_id; }) + +static struct net_device *find_dev_by_handle(unsigned int handle) +{ + struct list_head *ent; + struct net_private *np; + list_for_each ( ent, &dev_list ) + { + np = list_entry(ent, struct net_private, list); + if ( np->handle == handle ) + return np->dev; + } + return NULL; +} + + +static int network_open(struct net_device *dev) +{ + struct net_private *np = dev->priv; + int i; + + if ( np->state != NETIF_STATE_CONNECTED ) + return -EINVAL; + + np->rx_resp_cons = np->tx_resp_cons = np->tx_full = 0; + memset(&np->stats, 0, sizeof(np->stats)); + spin_lock_init(&np->tx_lock); + spin_lock_init(&np->rx_lock); + + /* Initialise {tx,rx}_skbs to be a free chain containing every entry. 
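The free chain initialised here threads the freelist through the array itself: entry 0 holds the index of the first free slot and each free slot holds the index of the next, stored through the same pointer-sized cells that otherwise hold skb pointers. The same structure in standalone form, minus the casts:

    #include <stdio.h>

    #define N 8
    static unsigned long slot[N + 1];   /* slot[0] heads the free chain */

    static void put_id(unsigned long id) { slot[id] = slot[0]; slot[0] = id; }

    static unsigned long get_id(void)
    {
        unsigned long id = slot[0];
        slot[0] = slot[id];
        return id;
    }

    int main(void)
    {
        for (unsigned long i = 0; i <= N; i++)
            slot[i] = i + 1;                        /* every slot free   */
        unsigned long a = get_id(), b = get_id();
        printf("got %lu then %lu\n", a, b);         /* 1 then 2          */
        put_id(a);
        printf("after free, next is %lu\n", get_id());  /* 1 again       */
        return 0;
    }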
*/ + for ( i = 0; i <= NETIF_TX_RING_SIZE; i++ ) + np->tx_skbs[i] = (void *)(i+1); + for ( i = 0; i <= NETIF_RX_RING_SIZE; i++ ) + np->rx_skbs[i] = (void *)(i+1); + + wmb(); + np->state = NETIF_STATE_ACTIVE; + + network_alloc_rx_buffers(dev); + np->rx->event = np->rx_resp_cons + 1; + + netif_start_queue(dev); + + MOD_INC_USE_COUNT; + + return 0; +} + + +static void network_tx_buf_gc(struct net_device *dev) +{ + NETIF_RING_IDX i, prod; + unsigned short id; + struct net_private *np = dev->priv; + struct sk_buff *skb; + + do { + prod = np->tx->resp_prod; + + for ( i = np->tx_resp_cons; i != prod; i++ ) + { + id = np->tx->ring[MASK_NET_TX_IDX(i)].resp.id; + skb = np->tx_skbs[id]; + ADD_ID_TO_FREELIST(np->tx_skbs, id); + dev_kfree_skb_any(skb); + } + + np->tx_resp_cons = prod; + + /* + * Set a new event, then check for race with update of tx_cons. Note + * that it is essential to schedule a callback, no matter how few + * buffers are pending. Even if there is space in the transmit ring, + * higher layers may be blocked because too much data is outstanding: + * in such cases notification from Xen is likely to be the only kick + * that we'll get. + */ + np->tx->event = + prod + ((np->tx->req_prod - prod) >> 1) + 1; + mb(); + } + while ( prod != np->tx->resp_prod ); + + if ( np->tx_full && + ((np->tx->req_prod - prod) < NETIF_TX_RING_SIZE) ) + { + np->tx_full = 0; + if ( np->state == NETIF_STATE_ACTIVE ) + netif_wake_queue(dev); + } +} + + +static void network_alloc_rx_buffers(struct net_device *dev) +{ + unsigned short id; + struct net_private *np = dev->priv; + struct sk_buff *skb; + NETIF_RING_IDX i = np->rx->req_prod; + int nr_pfns = 0; + + /* Make sure the batch is large enough to be worthwhile (1/2 ring). */ + if ( unlikely((i - np->rx_resp_cons) > (NETIF_RX_RING_SIZE/2)) || + unlikely(np->state != NETIF_STATE_ACTIVE) ) + return; + + do { + skb = dev_alloc_skb(RX_BUF_SIZE); + if ( unlikely(skb == NULL) ) + break; + + skb->dev = dev; + + if ( unlikely(((unsigned long)skb->head & (PAGE_SIZE-1)) != 0) ) + panic("alloc_skb needs to provide us page-aligned buffers."); + + id = GET_ID_FROM_FREELIST(np->rx_skbs); + + np->rx_skbs[id] = skb; + + np->rx->ring[MASK_NET_RX_IDX(i)].req.id = id; + + rx_pfn_array[nr_pfns] = virt_to_machine(skb->head) >> PAGE_SHIFT; + + rx_mcl[nr_pfns].op = __HYPERVISOR_update_va_mapping; + rx_mcl[nr_pfns].args[0] = (unsigned long)skb->head >> PAGE_SHIFT; + rx_mcl[nr_pfns].args[1] = 0; + rx_mcl[nr_pfns].args[2] = 0; + + nr_pfns++; + } + while ( (++i - np->rx_resp_cons) != NETIF_RX_RING_SIZE ); + + /* + * We may have allocated buffers which have entries outstanding in the page + * update queue -- make sure we flush those first! + */ + flush_page_update_queue(); + + /* After all PTEs have been zapped we blow away stale TLB entries. */ + rx_mcl[nr_pfns-1].args[2] = UVMF_FLUSH_TLB; + + /* Give away a batch of pages. */ + rx_mcl[nr_pfns].op = __HYPERVISOR_dom_mem_op; + rx_mcl[nr_pfns].args[0] = MEMOP_decrease_reservation; + rx_mcl[nr_pfns].args[1] = (unsigned long)rx_pfn_array; + rx_mcl[nr_pfns].args[2] = (unsigned long)nr_pfns; + + /* Zap PTEs and give away pages in one big multicall. */ + (void)HYPERVISOR_multicall(rx_mcl, nr_pfns+1); + + /* Check return status of HYPERVISOR_dom_mem_op(). 
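Worth a worked example: network_tx_buf_gc above sets np->tx->event = prod + ((req_prod - prod) >> 1) + 1, i.e. it asks for the next notification at the halfway point of the currently outstanding requests, so a burst is reaped in a logarithmic number of wakeups rather than one per packet. Under assumed sample values:

    #include <stdio.h>

    int main(void)
    {
        unsigned int resp_prod = 100;   /* responses consumed so far */
        unsigned int req_prod  = 116;   /* requests posted so far    */
        /* 16 outstanding: ask to be woken after 8 more responses.   */
        unsigned int event = resp_prod + ((req_prod - resp_prod) >> 1) + 1;
        printf("event threshold = %u\n", event);   /* 109 */
        return 0;
    }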
*/ + if ( rx_mcl[nr_pfns].args[5] != nr_pfns ) + panic("Unable to reduce memory reservation\n"); + + np->rx->req_prod = i; +} + + +static int network_start_xmit(struct sk_buff *skb, struct net_device *dev) +{ + unsigned short id; + struct net_private *np = (struct net_private *)dev->priv; + netif_tx_request_t *tx; + NETIF_RING_IDX i; + + if ( unlikely(np->tx_full) ) + { + printk(KERN_ALERT "%s: full queue wasn't stopped!\n", dev->name); + netif_stop_queue(dev); + return -ENOBUFS; + } + + if ( unlikely((((unsigned long)skb->data & ~PAGE_MASK) + skb->len) >= + PAGE_SIZE) ) + { + struct sk_buff *new_skb = dev_alloc_skb(RX_BUF_SIZE); + if ( unlikely(new_skb == NULL) ) + return 1; + skb_put(new_skb, skb->len); + memcpy(new_skb->data, skb->data, skb->len); + dev_kfree_skb(skb); + skb = new_skb; + } + + spin_lock_irq(&np->tx_lock); + + /* if the backend isn't available then don't do anything! */ + if ( !netif_carrier_ok(dev) ) + { + spin_unlock_irq(&np->tx_lock); + return 1; + } + + i = np->tx->req_prod; + + id = GET_ID_FROM_FREELIST(np->tx_skbs); + np->tx_skbs[id] = skb; + + tx = &np->tx->ring[MASK_NET_TX_IDX(i)].req; + + tx->id = id; + tx->addr = virt_to_machine(skb->data); + tx->size = skb->len; + + wmb(); + np->tx->req_prod = i + 1; + + network_tx_buf_gc(dev); + + if ( (i - np->tx_resp_cons) == (NETIF_TX_RING_SIZE - 1) ) + { + np->tx_full = 1; + netif_stop_queue(dev); + } + + spin_unlock_irq(&np->tx_lock); + + np->stats.tx_bytes += skb->len; + np->stats.tx_packets++; + + /* Only notify Xen if there are no outstanding responses. */ + mb(); + if ( np->tx->resp_prod == i ) + notify_via_evtchn(np->evtchn); + + return 0; +} + + +static void netif_int(int irq, void *dev_id, struct pt_regs *ptregs) +{ + struct net_device *dev = dev_id; + struct net_private *np = dev->priv; + unsigned long flags; + + spin_lock_irqsave(&np->tx_lock, flags); + + if( !netif_carrier_ok(dev) ) + { + spin_unlock_irqrestore(&np->tx_lock, flags); + return; + } + + network_tx_buf_gc(dev); + spin_unlock_irqrestore(&np->tx_lock, flags); + + if ( np->rx_resp_cons != np->rx->resp_prod ) + netif_rx_schedule(dev); +} + + +static int netif_poll(struct net_device *dev, int *pbudget) +{ + struct net_private *np = dev->priv; + struct sk_buff *skb; + netif_rx_response_t *rx; + NETIF_RING_IDX i; + mmu_update_t *mmu = rx_mmu; + multicall_entry_t *mcl = rx_mcl; + int work_done, budget, more_to_do = 1; + struct sk_buff_head rxq; + unsigned long flags; + + spin_lock(&np->rx_lock); + + /* if the device is undergoing recovery then don't do anything */ + if ( !netif_carrier_ok(dev) ) + { + spin_unlock(&np->rx_lock); + return 0; + } + + skb_queue_head_init(&rxq); + + if ( (budget = *pbudget) > dev->quota ) + budget = dev->quota; + + for ( i = np->rx_resp_cons, work_done = 0; + (i != np->rx->resp_prod) && (work_done < budget); + i++, work_done++ ) + { + rx = &np->rx->ring[MASK_NET_RX_IDX(i)].resp; + + skb = np->rx_skbs[rx->id]; + ADD_ID_TO_FREELIST(np->rx_skbs, rx->id); + + if ( unlikely(rx->status <= 0) ) + { + /* Gate this error. We get a (valid) slew of them on suspend. */ + if ( np->state == NETIF_STATE_ACTIVE ) + printk(KERN_ALERT "bad buffer on RX ring!(%d)\n", rx->status); + dev_kfree_skb(skb); + continue; + } + + skb->data = skb->tail = skb->head + (rx->addr & ~PAGE_MASK); + skb_put(skb, rx->status); + + np->stats.rx_packets++; + np->stats.rx_bytes += rx->status; + + /* Remap the page. 
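The remap that follows is the page-flip bookkeeping in miniature: the guest patches its own phys-to-machine array directly, while the machine-to-phys direction must be handed to the hypervisor as an MMU_MACHPHYS_UPDATE request. A toy model of keeping the two directions consistent (array sizes and names illustrative, not the real tables):

    #include <stdio.h>

    #define NPAGES 16
    static unsigned long p2m[NPAGES];   /* guest pseudo-phys -> machine */
    static unsigned long m2p[NPAGES];   /* machine -> guest pseudo-phys */

    /* Flip machine frame 'mfn' in under pseudo-physical frame 'pfn'. */
    static void flip_in(unsigned long pfn, unsigned long mfn)
    {
        p2m[pfn] = mfn;   /* updated directly by the guest             */
        m2p[mfn] = pfn;   /* in Xen, the MMU_MACHPHYS_UPDATE request   */
    }

    int main(void)
    {
        flip_in(3, 9);
        printf("pfn 3 -> mfn %lu, mfn 9 -> pfn %lu\n", p2m[3], m2p[9]);
        return 0;
    }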
*/ + mmu->ptr = (rx->addr & PAGE_MASK) | MMU_MACHPHYS_UPDATE; + mmu->val = __pa(skb->head) >> PAGE_SHIFT; + mmu++; + mcl->op = __HYPERVISOR_update_va_mapping; + mcl->args[0] = (unsigned long)skb->head >> PAGE_SHIFT; + mcl->args[1] = (rx->addr & PAGE_MASK) | __PAGE_KERNEL; + mcl->args[2] = 0; + mcl++; + + phys_to_machine_mapping[__pa(skb->head) >> PAGE_SHIFT] = + rx->addr >> PAGE_SHIFT; + + __skb_queue_tail(&rxq, skb); + } + + /* Do all the remapping work, and M->P updates, in one big hypercall. */ + if ( likely((mcl - rx_mcl) != 0) ) + { + mcl->op = __HYPERVISOR_mmu_update; + mcl->args[0] = (unsigned long)rx_mmu; + mcl->args[1] = mmu - rx_mmu; + mcl->args[2] = 0; + mcl++; + (void)HYPERVISOR_multicall(rx_mcl, mcl - rx_mcl); + } + + while ( (skb = __skb_dequeue(&rxq)) != NULL ) + { + /* Set the shared-info area, which is hidden behind the real data. */ + atomic_set(&(skb_shinfo(skb)->dataref), 1); + skb_shinfo(skb)->nr_frags = 0; + skb_shinfo(skb)->frag_list = NULL; + + /* Ethernet-specific work. Delayed to here as it peeks the header. */ + skb->protocol = eth_type_trans(skb, dev); + + /* Pass it up. */ + netif_rx(skb); + dev->last_rx = jiffies; + } + + np->rx_resp_cons = i; + + network_alloc_rx_buffers(dev); + + *pbudget -= work_done; + dev->quota -= work_done; + + if ( work_done < budget ) + { + local_irq_save(flags); + + np->rx->event = i + 1; + + /* Deal with hypervisor racing our resetting of rx_event. */ + mb(); + if ( np->rx->resp_prod == i ) + { + __netif_rx_complete(dev); + more_to_do = 0; + } + + local_irq_restore(flags); + } + + spin_unlock(&np->rx_lock); + + return more_to_do; +} + + +static int network_close(struct net_device *dev) +{ + struct net_private *np = dev->priv; + + netif_stop_queue(np->dev); + + np->state = NETIF_STATE_CONNECTED; + + /* XXX We need to properly disconnect via the domain controller. */ + while ( /*(np->rx_resp_cons != np->rx->req_prod) ||*/ + (np->tx_resp_cons != np->tx->req_prod) ) + { + barrier(); + current->state = TASK_INTERRUPTIBLE; + schedule_timeout(1); + } + + MOD_DEC_USE_COUNT; + + return 0; +} + + +static struct net_device_stats *network_get_stats(struct net_device *dev) +{ + struct net_private *np = (struct net_private *)dev->priv; + return &np->stats; +} + + +static void netif_status_change(netif_fe_interface_status_changed_t *status) +{ + ctrl_msg_t cmsg; + netif_fe_interface_connect_t up; + struct net_device *dev; + struct net_private *np; + + if ( status->handle != 0 ) + { + printk(KERN_WARNING "Status change on unsupported netif %d\n", + status->handle); + return; + } + + dev = find_dev_by_handle(0); + np = dev->priv; + + switch ( status->status ) + { + case NETIF_INTERFACE_STATUS_DESTROYED: + printk(KERN_WARNING "Unexpected netif-DESTROYED message in state %d\n", + np->state); + break; + + case NETIF_INTERFACE_STATUS_DISCONNECTED: + if ( np->state != NETIF_STATE_CLOSED ) + { + printk(KERN_WARNING "Unexpected netif-DISCONNECTED message" + " in state %d\n", np->state); + printk(KERN_INFO "Attempting to reconnect network interface\n"); + + /* Begin interface recovery. + * + * NB. Whilst we're recovering, we turn the carrier state off. We + * take measures to ensure that this device isn't used for + * anything. We also stop the queue for this device. Various + * different approaches (e.g. continuing to buffer packets) have + * been tested but don't appear to improve the overall impact on + * TCP connections. 
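One more pattern from netif_poll just above deserves a note: before going idle it writes rx->event, issues a barrier, then re-checks resp_prod, which is the standard defence against a producer racing the event reset and the wakeup being lost. A sketch of the shape, with __sync_synchronize() standing in for mb():

    #include <stdio.h>

    static volatile unsigned int resp_prod, event;

    /* Returns 1 if it is safe to go idle having consumed up to 'seen'. */
    static int try_to_idle(unsigned int seen)
    {
        event = seen + 1;          /* ask for a notification past 'seen' */
        __sync_synchronize();      /* order the write before the re-read */
        return resp_prod == seen;  /* producer raced us? then keep going */
    }

    int main(void)
    {
        resp_prod = 5;
        printf("idle ok: %d\n", try_to_idle(5));   /* 1: nothing new     */
        resp_prod = 6;
        printf("idle ok: %d\n", try_to_idle(5));   /* 0: must poll again */
        return 0;
    }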
+ * + * TODO: (MAW) Change the Xend<->Guest protocol so that a recovery + * is initiated by a special "RESET" message - disconnect could + * just mean we're not allowed to use this interface any more. + */ + + /* Stop old i/f to prevent errors whilst we rebuild the state. */ + spin_lock_irq(&np->tx_lock); + spin_lock(&np->rx_lock); + netif_stop_queue(dev); + netif_carrier_off(dev); + np->state = NETIF_STATE_DISCONNECTED; + spin_unlock(&np->rx_lock); + spin_unlock_irq(&np->tx_lock); + + /* Free resources. */ + free_irq(np->irq, dev); + unbind_evtchn_from_irq(np->evtchn); + free_page((unsigned long)np->tx); + free_page((unsigned long)np->rx); + } + + /* Move from CLOSED to DISCONNECTED state. */ + np->tx = (netif_tx_interface_t *)__get_free_page(GFP_KERNEL); + np->rx = (netif_rx_interface_t *)__get_free_page(GFP_KERNEL); + memset(np->tx, 0, PAGE_SIZE); + memset(np->rx, 0, PAGE_SIZE); + np->state = NETIF_STATE_DISCONNECTED; + + /* Construct an interface-CONNECT message for the domain controller. */ + cmsg.type = CMSG_NETIF_FE; + cmsg.subtype = CMSG_NETIF_FE_INTERFACE_CONNECT; + cmsg.length = sizeof(netif_fe_interface_connect_t); + up.handle = 0; + up.tx_shmem_frame = virt_to_machine(np->tx) >> PAGE_SHIFT; + up.rx_shmem_frame = virt_to_machine(np->rx) >> PAGE_SHIFT; + memcpy(cmsg.msg, &up, sizeof(up)); + + /* Tell the controller to bring up the interface. */ + ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE); + break; + + case NETIF_INTERFACE_STATUS_CONNECTED: + if ( np->state == NETIF_STATE_CLOSED ) + { + printk(KERN_WARNING "Unexpected netif-CONNECTED message" + " in state %d\n", np->state); + break; + } + + memcpy(dev->dev_addr, status->mac, ETH_ALEN); + + if(netif_carrier_ok(dev)) + np->state = NETIF_STATE_CONNECTED; + else + { + int i, requeue_idx; + netif_tx_request_t *tx; + + spin_lock_irq(&np->rx_lock); + spin_lock(&np->tx_lock); + + /* Recovery procedure: */ + + /* Step 1: Reinitialise variables. */ + np->rx_resp_cons = np->tx_resp_cons = np->tx_full = 0; + np->rx->event = 1; + + /* Step 2: Rebuild the RX and TX ring contents. + * NB. We could just free the queued TX packets now but we hope + * that sending them out might do some good. We have to rebuild + * the RX ring because some of our pages are currently flipped out + * so we can't just free the RX skbs. + * NB2. Freelist index entries are always going to be less than + * __PAGE_OFFSET, whereas pointers to skbs will always be equal or + * greater than __PAGE_OFFSET: we use this property to distinguish + * them. + */ + + /* Rebuild the TX buffer freelist and the TX ring itself. + * NB. This reorders packets. We could keep more private state + * to avoid this but maybe it doesn't matter so much given the + * interface has been down. + */ + for ( requeue_idx = 0, i = 1; i <= NETIF_TX_RING_SIZE; i++ ) + { + if ( (unsigned long)np->tx_skbs[i] >= __PAGE_OFFSET ) + { + struct sk_buff *skb = np->tx_skbs[i]; + + tx = &np->tx->ring[requeue_idx++].req; + + tx->id = i; + tx->addr = virt_to_machine(skb->data); + tx->size = skb->len; + + np->stats.tx_bytes += skb->len; + np->stats.tx_packets++; + } + } + wmb(); + np->tx->req_prod = requeue_idx; + + /* Rebuild the RX buffer freelist and the RX ring itself. */ + for ( requeue_idx = 0, i = 1; i <= NETIF_RX_RING_SIZE; i++ ) + if ( (unsigned long)np->rx_skbs[i] >= __PAGE_OFFSET ) + np->rx->ring[requeue_idx++].req.id = i; + wmb(); + np->rx->req_prod = requeue_idx; + + /* Step 3: All public and private state should now be sane. 
Get + * ready to start sending and receiving packets and give the driver + * domain a kick because we've probably just requeued some + * packets. + */ + netif_carrier_on(dev); + netif_start_queue(dev); + np->state = NETIF_STATE_ACTIVE; + + notify_via_evtchn(status->evtchn); + + network_tx_buf_gc(dev); + + printk(KERN_INFO "Recovery completed\n"); + + spin_unlock(&np->tx_lock); + spin_unlock_irq(&np->rx_lock); + } + + np->evtchn = status->evtchn; + np->irq = bind_evtchn_to_irq(np->evtchn); + (void)request_irq(np->irq, netif_int, SA_SAMPLE_RANDOM, + dev->name, dev); + break; + + default: + printk(KERN_WARNING "Status change to unknown value %d\n", + status->status); + break; + } +} + + +static void netif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id) +{ + switch ( msg->subtype ) + { + case CMSG_NETIF_FE_INTERFACE_STATUS_CHANGED: + if ( msg->length != sizeof(netif_fe_interface_status_changed_t) ) + goto parse_error; + netif_status_change((netif_fe_interface_status_changed_t *) + &msg->msg[0]); + break; + default: + goto parse_error; + } + + ctrl_if_send_response(msg); + return; + + parse_error: + msg->length = 0; + ctrl_if_send_response(msg); +} + + +static int __init init_module(void) +{ + ctrl_msg_t cmsg; + netif_fe_driver_status_changed_t st; + int err; + struct net_device *dev; + struct net_private *np; + + if ( start_info.flags & SIF_INITDOMAIN + || start_info.flags & SIF_NET_BE_DOMAIN ) + return 0; + + printk("Initialising Xen virtual ethernet frontend driver"); + + INIT_LIST_HEAD(&dev_list); + + if ( (dev = alloc_etherdev(sizeof(struct net_private))) == NULL ) + { + err = -ENOMEM; + goto fail; + } + + np = dev->priv; + np->state = NETIF_STATE_CLOSED; + np->handle = 0; + + dev->open = network_open; + dev->hard_start_xmit = network_start_xmit; + dev->stop = network_close; + dev->get_stats = network_get_stats; + dev->poll = netif_poll; + dev->weight = 64; + + if ( (err = register_netdev(dev)) != 0 ) + { + kfree(dev); + goto fail; + } + + np->dev = dev; + list_add(&np->list, &dev_list); + + (void)ctrl_if_register_receiver(CMSG_NETIF_FE, netif_ctrlif_rx, + CALLBACK_IN_BLOCKING_CONTEXT); + + /* Send a driver-UP notification to the domain controller. */ + cmsg.type = CMSG_NETIF_FE; + cmsg.subtype = CMSG_NETIF_FE_DRIVER_STATUS_CHANGED; + cmsg.length = sizeof(netif_fe_driver_status_changed_t); + st.status = NETIF_DRIVER_STATUS_UP; + memcpy(cmsg.msg, &st, sizeof(st)); + + ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE); + + /* + * We should read 'nr_interfaces' from response message and wait + * for notifications before proceeding. For now we assume that we + * will be notified of exactly one interface. + */ + while ( np->state != NETIF_STATE_CONNECTED ) + { + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(1); + } + + return 0; + + fail: + cleanup_module(); + return err; +} + + +static void cleanup_module(void) +{ + /* XXX FIXME */ + BUG(); +} + + +module_init(init_module); +module_exit(cleanup_module); diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/netif/netif.h b/linux-2.4.26-xen-sparse/arch/xen/drivers/netif/netif.h new file mode 100644 index 0000000000..098b292612 --- /dev/null +++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/netif/netif.h @@ -0,0 +1,88 @@ +/****************************************************************************** + * netif.h + * + * Unified network-device I/O interface for Xen guest OSes. 
+ *
+ * Copyright (c) 2003-2004, Keir Fraser
+ */
+
+#ifndef __SHARED_NETIF_H__
+#define __SHARED_NETIF_H__
+
+typedef struct {
+    memory_t addr;   /* 0: Machine address of packet.  */
+    MEMORY_PADDING;
+    u16      id;     /* 8: Echoed in response message. */
+    u16      size;   /* 10: Packet size in bytes.      */
+} PACKED netif_tx_request_t; /* 12 bytes */
+
+typedef struct {
+    u16      id;     /* 0 */
+    s8       status; /* 2 */
+    u8       __pad;  /* 3 */
+} PACKED netif_tx_response_t; /* 4 bytes */
+
+typedef struct {
+    u16      id;     /* 0: Echoed in response message. */
+} PACKED netif_rx_request_t; /* 2 bytes */
+
+typedef struct {
+    memory_t addr;   /* 0: Machine address of packet.               */
+    MEMORY_PADDING;
+    u16      id;     /* 8:                                          */
+    s16      status; /* 10: -ve: NETIF_RSP_* ; +ve: Rx'ed pkt size. */
+} PACKED netif_rx_response_t; /* 12 bytes */
+
+/*
+ * We use a special capitalised type name because it is _essential_ that all
+ * arithmetic on indexes is done on an integer type of the correct size.
+ */
+typedef u32 NETIF_RING_IDX;
+
+/*
+ * Ring indexes are 'free running'. That is, they are not stored modulo the
+ * size of the ring buffer. The following macros convert a free-running counter
+ * into a value that can directly index a ring-buffer array.
+ */
+#define MASK_NETIF_RX_IDX(_i) ((_i)&(NETIF_RX_RING_SIZE-1))
+#define MASK_NETIF_TX_IDX(_i) ((_i)&(NETIF_TX_RING_SIZE-1))
+
+#define NETIF_TX_RING_SIZE 256
+#define NETIF_RX_RING_SIZE 256
+
+/* This structure must fit in a memory page. */
+typedef struct {
+    /*
+     * Frontend places packets into ring at tx_req_prod.
+     * Frontend receives event when tx_resp_prod passes tx_event.
+     */
+    NETIF_RING_IDX req_prod;  /* 0 */
+    NETIF_RING_IDX resp_prod; /* 4 */
+    NETIF_RING_IDX event;     /* 8 */
+    union {                   /* 12 */
+        netif_tx_request_t  req;
+        netif_tx_response_t resp;
+    } PACKED ring[NETIF_TX_RING_SIZE];
+} PACKED netif_tx_interface_t;
+
+/* This structure must fit in a memory page. */
+typedef struct {
+    /*
+     * Frontend places empty buffers into ring at rx_req_prod.
+     * Frontend receives event when rx_resp_prod passes rx_event.
+     */
+    NETIF_RING_IDX req_prod;  /* 0 */
+    NETIF_RING_IDX resp_prod; /* 4 */
+    NETIF_RING_IDX event;     /* 8 */
+    union {                   /* 12 */
+        netif_rx_request_t  req;
+        netif_rx_response_t resp;
+    } PACKED ring[NETIF_RX_RING_SIZE];
+} PACKED netif_rx_interface_t;
+
+/* Descriptor status values */
+#define NETIF_RSP_DROPPED  -2
+#define NETIF_RSP_ERROR    -1
+#define NETIF_RSP_OKAY      0
+
+#endif
diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/network/Makefile b/linux-2.4.26-xen-sparse/arch/xen/drivers/network/Makefile
new file mode 100644
index 0000000000..2e4c1f4825
--- /dev/null
+++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/network/Makefile
@@ -0,0 +1,3 @@
+O_TARGET := drv.o
+obj-y := network.o
+include $(TOPDIR)/Rules.make
diff --git a/linux-2.4.26-xen-sparse/arch/xen/drivers/network/network.c b/linux-2.4.26-xen-sparse/arch/xen/drivers/network/network.c
new file mode 100644
index 0000000000..e6c340685a
--- /dev/null
+++ b/linux-2.4.26-xen-sparse/arch/xen/drivers/network/network.c
@@ -0,0 +1,656 @@
+/******************************************************************************
+ * network.c
+ *
+ * Virtual network driver for XenoLinux.
+ * + * Copyright (c) 2002-2003, K A Fraser + */ + +#include <linux/config.h> +#include <linux/module.h> + +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/errno.h> + +#include <linux/netdevice.h> +#include <linux/inetdevice.h> +#include <linux/etherdevice.h> +#include <linux/skbuff.h> +#include <linux/init.h> + +#include <asm/io.h> +#include <net/sock.h> +#include <net/pkt_sched.h> + +#define RX_BUF_SIZE ((PAGE_SIZE/2)+1) /* Fool the slab allocator :-) */ + +static void network_interrupt(int irq, void *dev_id, struct pt_regs *ptregs); +static void network_tx_buf_gc(struct net_device *dev); +static void network_alloc_rx_buffers(struct net_device *dev); +static void cleanup_module(void); + +/* Dynamically-mapped IRQs. */ +static int network_irq, debug_irq; + +static struct list_head dev_list; + +struct net_private +{ + struct list_head list; + struct net_device *dev; + + struct net_device_stats stats; + NET_RING_IDX rx_resp_cons, tx_resp_cons; + unsigned int net_ring_fixmap_idx, tx_full; + net_ring_t *net_ring; + net_idx_t *net_idx; + spinlock_t tx_lock; + unsigned int idx; /* Domain-specific index of this VIF. */ + + unsigned int rx_bufs_to_notify; + +#define STATE_ACTIVE 0 +#define STATE_SUSPENDED 1 +#define STATE_CLOSED 2 + unsigned int state; + + /* + * {tx,rx}_skbs store outstanding skbuffs. The first entry in each + * array is an index into a chain of free entries. + */ + struct sk_buff *tx_skbs[XENNET_TX_RING_SIZE+1]; + struct sk_buff *rx_skbs[XENNET_RX_RING_SIZE+1]; +}; + +/* Access macros for acquiring freeing slots in {tx,rx}_skbs[]. */ +#define ADD_ID_TO_FREELIST(_list, _id) \ + (_list)[(_id)] = (_list)[0]; \ + (_list)[0] = (void *)(unsigned long)(_id); +#define GET_ID_FROM_FREELIST(_list) \ + ({ unsigned long _id = (unsigned long)(_list)[0]; \ + (_list)[0] = (_list)[_id]; \ + (unsigned short)_id; }) + + +static void _dbg_network_int(struct net_device *dev) +{ + struct net_private *np = dev->priv; + + if ( np->state == STATE_CLOSED ) + return; + + printk(KERN_ALERT "net: tx_full=%d, tx_resp_cons=0x%08x," + " tx_req_prod=0x%08x\nnet: tx_resp_prod=0x%08x," + " tx_event=0x%08x, state=%d\n", + np->tx_full, np->tx_resp_cons, + np->net_idx->tx_req_prod, np->net_idx->tx_resp_prod, + np->net_idx->tx_event, + test_bit(__LINK_STATE_XOFF, &dev->state)); + printk(KERN_ALERT "net: rx_resp_cons=0x%08x," + " rx_req_prod=0x%08x\nnet: rx_resp_prod=0x%08x, rx_event=0x%08x\n", + np->rx_resp_cons, np->net_idx->rx_req_prod, + np->net_idx->rx_resp_prod, np->net_idx->rx_event); +} + + +static void dbg_network_int(int irq, void *unused, struct pt_regs *ptregs) +{ + struct list_head *ent; + struct net_private *np; + list_for_each ( ent, &dev_list ) + { + np = list_entry(ent, struct net_private, list); + _dbg_network_int(np->dev); + } +} + + +static int network_open(struct net_device *dev) +{ + struct net_private *np = dev->priv; + netop_t netop; + int i, ret; + + netop.cmd = NETOP_RESET_RINGS; + netop.vif = np->idx; + if ( (ret = HYPERVISOR_net_io_op(&netop)) != 0 ) + { + printk(KERN_ALERT "Possible net trouble: couldn't reset ring idxs\n"); + return ret; + } + + netop.cmd = NETOP_GET_VIF_INFO; + netop.vif = np->idx; + if ( (ret = HYPERVISOR_net_io_op(&netop)) != 0 ) + { + printk(KERN_ALERT "Couldn't get info for vif %d\n", np->idx); + return ret; + } + + memcpy(dev->dev_addr, netop.u.get_vif_info.vmac, ETH_ALEN); + + set_fixmap(FIX_NETRING0_BASE + np->net_ring_fixmap_idx, + netop.u.get_vif_info.ring_mfn << PAGE_SHIFT); + 
np->net_ring = (net_ring_t *)fix_to_virt( + FIX_NETRING0_BASE + np->net_ring_fixmap_idx); + np->net_idx = &HYPERVISOR_shared_info->net_idx[np->idx]; + + np->rx_bufs_to_notify = 0; + np->rx_resp_cons = np->tx_resp_cons = np->tx_full = 0; + memset(&np->stats, 0, sizeof(np->stats)); + spin_lock_init(&np->tx_lock); + memset(np->net_ring, 0, sizeof(*np->net_ring)); + memset(np->net_idx, 0, sizeof(*np->net_idx)); + + /* Initialise {tx,rx}_skbs to be a free chain containing every entry. */ + for ( i = 0; i <= XENNET_TX_RING_SIZE; i++ ) + np->tx_skbs[i] = (void *)(i+1); + for ( i = 0; i <= XENNET_RX_RING_SIZE; i++ ) + np->rx_skbs[i] = (void *)(i+1); + + wmb(); + np->state = STATE_ACTIVE; + + network_alloc_rx_buffers(dev); + + netif_start_queue(dev); + + MOD_INC_USE_COUNT; + + return 0; +} + + +static void network_tx_buf_gc(struct net_device *dev) +{ + NET_RING_IDX i, prod; + unsigned short id; + struct net_private *np = dev->priv; + struct sk_buff *skb; + tx_entry_t *tx_ring = np->net_ring->tx_ring; + + do { + prod = np->net_idx->tx_resp_prod; + + for ( i = np->tx_resp_cons; i != prod; i++ ) + { + id = tx_ring[MASK_NET_TX_IDX(i)].resp.id; + skb = np->tx_skbs[id]; + ADD_ID_TO_FREELIST(np->tx_skbs, id); + dev_kfree_skb_any(skb); + } + + np->tx_resp_cons = prod; + + /* + * Set a new event, then check for race with update of tx_cons. Note + * that it is essential to schedule a callback, no matter how few + * buffers are pending. Even if there is space in the transmit ring, + * higher layers may be blocked because too much data is outstanding: + * in such cases notification from Xen is likely to be the only kick + * that we'll get. + */ + np->net_idx->tx_event = + prod + ((np->net_idx->tx_req_prod - prod) >> 1) + 1; + mb(); + } + while ( prod != np->net_idx->tx_resp_prod ); + + if ( np->tx_full && + ((np->net_idx->tx_req_prod - prod) < XENNET_TX_RING_SIZE) ) + { + np->tx_full = 0; + if ( np->state == STATE_ACTIVE ) + netif_wake_queue(dev); + } +} + + +static inline pte_t *get_ppte(void *addr) +{ + pgd_t *pgd; pmd_t *pmd; pte_t *pte; + pgd = pgd_offset_k( (unsigned long)addr); + pmd = pmd_offset(pgd, (unsigned long)addr); + pte = pte_offset(pmd, (unsigned long)addr); + return pte; +} + + +static void network_alloc_rx_buffers(struct net_device *dev) +{ + unsigned short id; + struct net_private *np = dev->priv; + struct sk_buff *skb; + netop_t netop; + NET_RING_IDX i = np->net_idx->rx_req_prod; + + if ( unlikely((i - np->rx_resp_cons) == XENNET_RX_RING_SIZE) || + unlikely(np->state != STATE_ACTIVE) ) + return; + + do { + skb = dev_alloc_skb(RX_BUF_SIZE); + if ( unlikely(skb == NULL) ) + break; + + skb->dev = dev; + + if ( unlikely(((unsigned long)skb->head & (PAGE_SIZE-1)) != 0) ) + panic("alloc_skb needs to provide us page-aligned buffers."); + + id = GET_ID_FROM_FREELIST(np->rx_skbs); + np->rx_skbs[id] = skb; + + np->net_ring->rx_ring[MASK_NET_RX_IDX(i)].req.id = id; + np->net_ring->rx_ring[MASK_NET_RX_IDX(i)].req.addr = + virt_to_machine(get_ppte(skb->head)); + + /* Shadow optimisation: disown this page from p->m map */ + phys_to_machine_mapping[virt_to_phys(skb->head)>>PAGE_SHIFT] = 0x80000004; + np->rx_bufs_to_notify++; + } + while ( (++i - np->rx_resp_cons) != XENNET_RX_RING_SIZE ); + + /* + * We may have allocated buffers which have entries outstanding in the page + * update queue -- make sure we flush those first! + */ + flush_page_update_queue(); + + np->net_idx->rx_req_prod = i; + np->net_idx->rx_event = np->rx_resp_cons + 1; + + /* Batch Xen notifications. 
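The quarter-ring threshold below amortises one hypercall over many posted buffers. The trade-off in miniature, where notify() is a hypothetical stand-in for the NETOP_PUSH_BUFFERS hypercall:

    #include <stdio.h>

    #define RING 32
    static int pending, notifications;

    static void notify(void) { notifications++; pending = 0; }  /* stand-in hypercall */

    static void post_buffer(void)
    {
        if (++pending > RING / 4)   /* batch: a handful of notifications per ring */
            notify();
    }

    int main(void)
    {
        for (int i = 0; i < RING; i++)
            post_buffer();
        printf("%d buffers, %d notifications\n", RING, notifications);
        return 0;
    }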
*/
+    if ( np->rx_bufs_to_notify > (XENNET_RX_RING_SIZE/4) )
+    {
+        netop.cmd = NETOP_PUSH_BUFFERS;
+        netop.vif = np->idx;
+        (void)HYPERVISOR_net_io_op(&netop);
+        np->rx_bufs_to_notify = 0;
+    }
+}
+
+
+static int network_start_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+    unsigned short id;
+    struct net_private *np = (struct net_private *)dev->priv;
+    tx_req_entry_t *tx;
+    netop_t netop;
+    NET_RING_IDX i;
+
+    if ( unlikely(np->tx_full) )
+    {
+        printk(KERN_ALERT "%s: full queue wasn't stopped!\n", dev->name);
+        netif_stop_queue(dev);
+        return -ENOBUFS;
+    }
+
+    if ( unlikely((((unsigned long)skb->data & ~PAGE_MASK) + skb->len) >=
+                  PAGE_SIZE) )
+    {
+        struct sk_buff *new_skb = dev_alloc_skb(RX_BUF_SIZE);
+        if ( unlikely(new_skb == NULL) )
+            return 1;
+        skb_put(new_skb, skb->len);
+        memcpy(new_skb->data, skb->data, skb->len);
+        dev_kfree_skb(skb);
+        skb = new_skb;
+    }
+
+    spin_lock_irq(&np->tx_lock);
+
+    i = np->net_idx->tx_req_prod;
+
+    id = GET_ID_FROM_FREELIST(np->tx_skbs);
+    np->tx_skbs[id] = skb;
+
+    tx = &np->net_ring->tx_ring[MASK_NET_TX_IDX(i)].req;
+
+    tx->id   = id;
+    tx->addr = phys_to_machine(virt_to_phys(skb->data));
+    tx->size = skb->len;
+
+    wmb();
+    np->net_idx->tx_req_prod = i + 1;
+
+    network_tx_buf_gc(dev);
+
+    if ( (i - np->tx_resp_cons) == (XENNET_TX_RING_SIZE - 1) )
+    {
+        np->tx_full = 1;
+        netif_stop_queue(dev);
+    }
+
+    spin_unlock_irq(&np->tx_lock);
+
+    np->stats.tx_bytes += skb->len;
+    np->stats.tx_packets++;
+
+    /* Only notify Xen if there are no outstanding responses. */
+    mb();
+    if ( np->net_idx->tx_resp_prod == i )
+    {
+        netop.cmd = NETOP_PUSH_BUFFERS;
+        netop.vif = np->idx;
+        (void)HYPERVISOR_net_io_op(&netop);
+    }
+
+    return 0;
+}
+
+
+static inline void _network_interrupt(struct net_device *dev)
+{
+    struct net_private *np = dev->priv;
+    unsigned long flags;
+    struct sk_buff *skb;
+    rx_resp_entry_t *rx;
+    NET_RING_IDX i;
+
+    if ( unlikely(np->state == STATE_CLOSED) )
+        return;
+
+    spin_lock_irqsave(&np->tx_lock, flags);
+    network_tx_buf_gc(dev);
+    spin_unlock_irqrestore(&np->tx_lock, flags);
+
+ again:
+    for ( i = np->rx_resp_cons; i != np->net_idx->rx_resp_prod; i++ )
+    {
+        rx = &np->net_ring->rx_ring[MASK_NET_RX_IDX(i)].resp;
+
+        skb = np->rx_skbs[rx->id];
+        ADD_ID_TO_FREELIST(np->rx_skbs, rx->id);
+
+/* XXXXX this is unsafe for live migrate -- if we do a scan before this
+point we won't transmit the right mfn! We have to fix this up in
+xc_linux_save */
+
+        phys_to_machine_mapping[virt_to_phys(skb->head) >> PAGE_SHIFT] =
+            (*(unsigned long *)get_ppte(skb->head)) >> PAGE_SHIFT;
+
+        if ( unlikely(rx->status != RING_STATUS_OK) )
+        {
+            /* Gate this error. We get a (valid) slew of them on suspend. */
+            if ( np->state == STATE_ACTIVE )
+            {
+                /* With live migrate, we even get these... Disable for now. */
+                // printk(KERN_ALERT "bad buffer on RX ring!(%d)\n", rx->status);
+            }
+            else
+                phys_to_machine_mapping[virt_to_phys(skb->head) >> PAGE_SHIFT] =
+                    0x80000004; // disown this page -- it was a flush
+
+            dev_kfree_skb_any(skb);
+            continue;
+        }
+
+        /*
+         * Set up shinfo -- from alloc_skb. This was particularly nasty: the
+         * shared info is hidden at the back of the data area (presumably so it
+         * can be shared), but on page flip it gets very spunked.
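That is, alloc_skb hides struct skb_shared_info at the tail of the data page, so once the page under skb->head has been flipped the area contains whatever the sender left there and every field must be rebuilt, which is what the three assignments below do. A standalone sketch of the layout assumption (struct fields and sizes illustrative):

    #include <stdio.h>

    #define PAGE 4096
    struct shinfo { int dataref; int nr_frags; void *frag_list; };

    int main(void)
    {
        char page[PAGE];
        /* Same layout trick as alloc_skb: shinfo at the page's tail. */
        struct shinfo *si = (struct shinfo *)(page + PAGE - sizeof(*si));
        si->dataref = 1; si->nr_frags = 0; si->frag_list = 0;
        printf("shinfo at offset %zu\n", (size_t)((char *)si - page));
        return 0;
    }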
+         */
+        atomic_set(&(skb_shinfo(skb)->dataref), 1);
+        skb_shinfo(skb)->nr_frags = 0;
+        skb_shinfo(skb)->frag_list = NULL;
+
+        skb->data = skb->tail = skb->head + rx->offset;
+        skb_put(skb, rx->size);
+        skb->protocol = eth_type_trans(skb, dev);
+
+        np->stats.rx_packets++;
+        np->stats.rx_bytes += rx->size;
+
+        netif_rx(skb);
+        dev->last_rx = jiffies;
+    }
+
+    np->rx_resp_cons = i;
+
+    network_alloc_rx_buffers(dev);
+
+    /* Deal with hypervisor racing our resetting of rx_event. */
+    mb();
+    if ( np->net_idx->rx_resp_prod != i )
+        goto again;
+}
+
+
+static void network_interrupt(int irq, void *unused, struct pt_regs *ptregs)
+{
+    struct list_head *ent;
+    struct net_private *np;
+    list_for_each ( ent, &dev_list )
+    {
+        np = list_entry(ent, struct net_private, list);
+        _network_interrupt(np->dev);
+    }
+}
+
+
+static int network_close(struct net_device *dev)
+{
+    struct net_private *np = dev->priv;
+    netop_t netop;
+
+    np->state = STATE_SUSPENDED;
+    wmb();
+
+    netif_stop_queue(np->dev);
+
+    netop.cmd = NETOP_FLUSH_BUFFERS;
+    netop.vif = np->idx;
+    (void)HYPERVISOR_net_io_op(&netop);
+
+    while ( (np->rx_resp_cons != np->net_idx->rx_req_prod) ||
+            (np->tx_resp_cons != np->net_idx->tx_req_prod) )
+    {
+        barrier();
+        current->state = TASK_INTERRUPTIBLE;
+        schedule_timeout(1);
+    }
+
+    wmb();
+    np->state = STATE_CLOSED;
+    wmb();
+
+    /* Now no longer safe to take interrupts for this device. */
+    clear_fixmap(FIX_NETRING0_BASE + np->net_ring_fixmap_idx);
+
+    MOD_DEC_USE_COUNT;
+
+    return 0;
+}
+
+
+static struct net_device_stats *network_get_stats(struct net_device *dev)
+{
+    struct net_private *np = (struct net_private *)dev->priv;
+    return &np->stats;
+}
+
+
+/*
+ * This notifier is installed for domain 0 only.
+ * All other domains have VFR rules installed on their behalf by domain 0
+ * when they are created. For bootstrap, Xen creates wildcard rules for
+ * domain 0 -- this notifier is used to detect when we find our proper
+ * IP address, so we can install the proper rules and remove the wildcards.
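+ *
+ * Concretely, for each address A the handler below installs (NETDEV_UP) or
+ * deletes (NETDEV_DOWN) a pair of ACCEPT rules: one for packets from this
+ * VIF with source address A out to the physical interface, and a mirror
+ * rule for packets from any interface with destination address A back to
+ * this VIF.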
+ */
+static int inetdev_notify(struct notifier_block *this,
+                          unsigned long event,
+                          void *ptr)
+{
+    struct in_ifaddr *ifa = (struct in_ifaddr *)ptr;
+    struct net_device *dev = ifa->ifa_dev->dev;
+    struct list_head *ent;
+    struct net_private *np;
+    int idx = -1;
+    network_op_t op;
+
+    list_for_each ( ent, &dev_list )
+    {
+        np = list_entry(ent, struct net_private, list);
+        if ( np->dev == dev )
+            idx = np->idx;
+    }
+
+    if ( idx == -1 )
+        goto out;
+
+    memset(&op, 0, sizeof(op));
+    op.u.net_rule.proto = NETWORK_PROTO_ANY;
+    op.u.net_rule.action = NETWORK_ACTION_ACCEPT;
+
+    if ( event == NETDEV_UP )
+        op.cmd = NETWORK_OP_ADDRULE;
+    else if ( event == NETDEV_DOWN )
+        op.cmd = NETWORK_OP_DELETERULE;
+    else
+        goto out;
+
+    op.u.net_rule.src_dom = 0;
+    op.u.net_rule.src_idx = idx;
+    op.u.net_rule.dst_dom = VIF_SPECIAL;
+    op.u.net_rule.dst_idx = VIF_PHYSICAL_INTERFACE;
+    op.u.net_rule.src_addr = ntohl(ifa->ifa_address);
+    op.u.net_rule.src_addr_mask = ~0UL;
+    op.u.net_rule.dst_addr = 0;
+    op.u.net_rule.dst_addr_mask = 0;
+    (void)HYPERVISOR_network_op(&op);
+
+    op.u.net_rule.src_dom = VIF_SPECIAL;
+    op.u.net_rule.src_idx = VIF_ANY_INTERFACE;
+    op.u.net_rule.dst_dom = 0;
+    op.u.net_rule.dst_idx = idx;
+    op.u.net_rule.src_addr = 0;
+    op.u.net_rule.src_addr_mask = 0;
+    op.u.net_rule.dst_addr = ntohl(ifa->ifa_address);
+    op.u.net_rule.dst_addr_mask = ~0UL;
+    (void)HYPERVISOR_network_op(&op);
+
+ out:
+    return NOTIFY_DONE;
+}
+
+static struct notifier_block notifier_inetdev = {
+    .notifier_call = inetdev_notify,
+    .next          = NULL,
+    .priority      = 0
+};
+
+
+static int __init init_module(void)
+{
+    int i, fixmap_idx = -1, err;
+    struct net_device *dev;
+    struct net_private *np;
+    netop_t netop;
+
+    INIT_LIST_HEAD(&dev_list);
+
+    /*
+     * Domain 0 must poke its own network rules as it discovers its IP
+     * addresses. All other domains have a privileged "parent" to do this
+     * for them at start of day.
+     */
+    if ( start_info.flags & SIF_INITDOMAIN )
+        (void)register_inetaddr_notifier(&notifier_inetdev);
+
+    network_irq = bind_virq_to_irq(VIRQ_NET);
+    debug_irq   = bind_virq_to_irq(VIRQ_DEBUG);
+
+    err = request_irq(network_irq, network_interrupt,
+                      SA_SAMPLE_RANDOM, "network", NULL);
+    if ( err )
+    {
+        printk(KERN_WARNING "Could not allocate network interrupt\n");
+        goto fail;
+    }
+
+    err = request_irq(debug_irq, dbg_network_int,
+                      SA_SHIRQ, "net_dbg", &dbg_network_int);
+    if ( err )
+        printk(KERN_WARNING "Non-fatal error -- no debug interrupt\n");
+
+    for ( i = 0; i < MAX_DOMAIN_VIFS; i++ )
+    {
+        /* If the VIF is invalid then the query hypercall will fail. */
+        netop.cmd = NETOP_GET_VIF_INFO;
+        netop.vif = i;
+        if ( HYPERVISOR_net_io_op(&netop) != 0 )
+            continue;
+
+        /*
+         * We actually only support up to 4 VIFs right now:
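+         * each VIF's shared ring is mapped via a fixmap slot in
+         * network_open() (FIX_NETRING0_BASE + fixmap_idx), and -- as far
+         * as this driver assumes -- only four such slots are reserved.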
+         */
+        if ( ++fixmap_idx == 4 )
+            break;
+
+        dev = alloc_etherdev(sizeof(struct net_private));
+        if ( dev == NULL )
+        {
+            err = -ENOMEM;
+            goto fail;
+        }
+
+        np = dev->priv;
+        np->state = STATE_CLOSED;
+        np->net_ring_fixmap_idx = fixmap_idx;
+        np->idx = i;
+
+        SET_MODULE_OWNER(dev);
+        dev->open            = network_open;
+        dev->hard_start_xmit = network_start_xmit;
+        dev->stop            = network_close;
+        dev->get_stats       = network_get_stats;
+
+        memcpy(dev->dev_addr, netop.u.get_vif_info.vmac, ETH_ALEN);
+
+        if ( (err = register_netdev(dev)) != 0 )
+        {
+            kfree(dev);
+            goto fail;
+        }
+
+        np->dev = dev;
+        list_add(&np->list, &dev_list);
+    }
+
+    return 0;
+
+ fail:
+    cleanup_module();
+    return err;
+}
+
+
+static void cleanup_module(void)
+{
+    struct net_private *np;
+    struct net_device *dev;
+
+    while ( !list_empty(&dev_list) )
+    {
+        np = list_entry(dev_list.next, struct net_private, list);
+        list_del(&np->list);
+        dev = np->dev;
+        unregister_netdev(dev);
+        kfree(dev);
+    }
+
+    if ( start_info.flags & SIF_INITDOMAIN )
+        (void)unregister_inetaddr_notifier(&notifier_inetdev);
+
+    /* dev_id values must match those passed to request_irq() above. */
+    free_irq(network_irq, NULL);
+    free_irq(debug_irq, &dbg_network_int);
+
+    unbind_virq_from_irq(VIRQ_NET);
+    unbind_virq_from_irq(VIRQ_DEBUG);
+}
+
+
+module_init(init_module);
+module_exit(cleanup_module);
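+
+/*
+ * Note that cleanup_module() (also the failure path of init_module())
+ * releases resources in a safe order: net devices come off first, then the
+ * address notifier and the interrupt handlers, and only then the
+ * VIRQ-to-IRQ bindings those handlers were attached to.
+ */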