/******************************************************************************
 * arch/xen/drivers/blkif/backend/main.c
 *
 * Back-end of the driver for virtual block devices. This portion of the
 * driver exports a 'unified' block-device interface that can be accessed
 * by any operating system that implements a compatible front end. A
 * reference front-end implementation can be found in:
 *  arch/xen/drivers/blkif/frontend
 *
 * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
 */

#include "common.h"

/*
 * These are rather arbitrary. They are fairly large because adjacent requests
 * pulled from a communication ring are quite likely to end up being part of
 * the same scatter/gather request at the disc.
 *
 * ** TRY INCREASING 'MAX_PENDING_REQS' IF WRITE SPEEDS SEEM TOO LOW **
 * This will increase the chances of being able to write whole tracks.
 * 64 should be enough to keep us competitive with Linux.
 */
#define MAX_PENDING_REQS 64
#define BATCH_PER_DOMAIN 16

/*
 * NB. We place a page of padding between each buffer page to avoid incorrect
 * merging of requests by the IDE and SCSI merging routines. Otherwise, two
 * adjacent buffers in a scatter-gather request would have adjacent page
 * numbers: since the merge routines don't realise that this is in *pseudophys*
 * space, not real space, they may collapse the s-g elements!
 */
static unsigned long mmap_vstart;
#define MMAP_PAGES_PER_REQUEST \
    (2 * (BLKIF_MAX_SEGMENTS_PER_REQUEST + 1))
#define MMAP_PAGES             \
    (MAX_PENDING_REQS * MMAP_PAGES_PER_REQUEST)
#define MMAP_VADDR(_req,_seg)                        \
    (mmap_vstart +                                   \
     ((_req) * MMAP_PAGES_PER_REQUEST * PAGE_SIZE) + \
     ((_seg) * 2 * PAGE_SIZE))

/*
 * Each outstanding request that we've passed to the lower device layers has a
 * 'pending_req' allocated to it. Each buffer_head that completes decrements
 * the pendcnt towards zero. When it hits zero, the specified domain has a
 * response queued for it, with the saved 'id' passed back.
 */
typedef struct {
    blkif_t       *blkif;
    unsigned long  id;
    int            nr_pages;
    atomic_t       pendcnt;
    unsigned short operation;
    int            status;
} pending_req_t;

/*
 * We can't allocate pending_req's in order, since they may complete out of
 * order. We therefore maintain an allocation ring. This ring also indicates
 * when enough work has been passed down -- at that point the allocation ring
 * will be empty.
 */
static pending_req_t pending_reqs[MAX_PENDING_REQS];
static unsigned char pending_ring[MAX_PENDING_REQS];
static spinlock_t pend_prod_lock = SPIN_LOCK_UNLOCKED;
/* NB. We use a different index type to differentiate from shared blk rings. */
typedef unsigned int PEND_RING_IDX;
#define MASK_PEND_IDX(_i) ((_i)&(MAX_PENDING_REQS-1))
static PEND_RING_IDX pending_prod, pending_cons;
#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)
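/*
 * Worked example of the allocation-ring arithmetic: both indices are
 * free-running and are reduced modulo MAX_PENDING_REQS only via
 * MASK_PEND_IDX(). pending_cons advances when a free slot is taken from
 * pending_ring[] for a new request; pending_prod advances when a completed
 * request returns its slot. Since init_module() starts with
 * pending_prod == MAX_PENDING_REQS and pending_cons == 0, a state such as
 * pending_prod == 67, pending_cons == 5 (with MAX_PENDING_REQS == 64) means
 * NR_PENDING_REQS == 64 - 67 + 5 == 2 requests are in flight, and the next
 * completed request returns its slot at pending_ring[MASK_PEND_IDX(67)],
 * i.e. pending_ring[3].
 */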
static kmem_cache_t *buffer_head_cachep;

static int do_block_io_op(blkif_t *blkif, int max_to_do);
static void dispatch_probe(blkif_t *blkif, blkif_request_t *req);
static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req);
static void make_response(blkif_t *blkif, unsigned long id,
                          unsigned short op, int st);

static void fast_flush_area(int idx, int nr_pages)
{
    multicall_entry_t mcl[MMAP_PAGES_PER_REQUEST];
    int               i;

    for ( i = 0; i < nr_pages; i++ )
    {
        mcl[i].op = __HYPERVISOR_update_va_mapping;
        mcl[i].args[0] = MMAP_VADDR(idx, i) >> PAGE_SHIFT;
        mcl[i].args[1] = 0;
        mcl[i].args[2] = 0;
    }

    mcl[nr_pages-1].args[2] = UVMF_FLUSH_TLB;
    (void)HYPERVISOR_multicall(mcl, nr_pages);
}


/******************************************************************
 * BLOCK-DEVICE SCHEDULER LIST MAINTENANCE
 */

static struct list_head io_schedule_list;
static spinlock_t io_schedule_list_lock;

static int __on_blkdev_list(blkif_t *blkif)
{
    return blkif->blkdev_list.next != NULL;
}

static void remove_from_blkdev_list(blkif_t *blkif)
{
    unsigned long flags;

    if ( !__on_blkdev_list(blkif) ) return;

    spin_lock_irqsave(&io_schedule_list_lock, flags);
    if ( __on_blkdev_list(blkif) )
    {
        list_del(&blkif->blkdev_list);
        blkif->blkdev_list.next = NULL;
        blkif_put(blkif);
    }
    spin_unlock_irqrestore(&io_schedule_list_lock, flags);
}

static void add_to_blkdev_list_tail(blkif_t *blkif)
{
    unsigned long flags;

    if ( __on_blkdev_list(blkif) ) return;

    spin_lock_irqsave(&io_schedule_list_lock, flags);
    if ( !__on_blkdev_list(blkif) && (blkif->status == CONNECTED) )
    {
        list_add_tail(&blkif->blkdev_list, &io_schedule_list);
        blkif_get(blkif);
    }
    spin_unlock_irqrestore(&io_schedule_list_lock, flags);
}


/******************************************************************
 * SCHEDULER FUNCTIONS
 */

static void io_schedule(unsigned long unused)
{
    blkif_t          *blkif;
    struct list_head *ent;

    /* Queue up a batch of requests. */
    while ( (NR_PENDING_REQS < MAX_PENDING_REQS) &&
            !list_empty(&io_schedule_list) )
    {
        ent = io_schedule_list.next;
        blkif = list_entry(ent, blkif_t, blkdev_list);
        blkif_get(blkif);
        remove_from_blkdev_list(blkif);
        if ( do_block_io_op(blkif, BATCH_PER_DOMAIN) )
            add_to_blkdev_list_tail(blkif);
        blkif_put(blkif);
    }

    /* Push the batch through to disc. */
    run_task_queue(&tq_disk);
}

static DECLARE_TASKLET(io_schedule_tasklet, io_schedule, 0);

static void maybe_trigger_io_schedule(void)
{
    /*
     * Needed so that two processes, who together make the following predicate
     * true, don't both read stale values and evaluate the predicate
     * incorrectly. Incredibly unlikely to stall the scheduler on x86, but...
     */
    smp_mb();

    if ( (NR_PENDING_REQS < (MAX_PENDING_REQS/2)) &&
         !list_empty(&io_schedule_list) )
        tasklet_schedule(&io_schedule_tasklet);
}
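/*
 * Overview of the request path through this file: a front-end notification
 * raises blkif_be_int(), which queues the interface on io_schedule_list; the
 * io_schedule() tasklet then pulls up to BATCH_PER_DOMAIN requests per
 * interface off the shared ring via do_block_io_op() and dispatches them;
 * each buffer_head completion runs end_block_io_op(), and the final
 * completion for a request queues a response via make_response() and
 * recycles the pending_req slot.
 */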
/******************************************************************
 * COMPLETION CALLBACK -- Called as bh->b_end_io()
 */

static void __end_block_io_op(pending_req_t *pending_req, int uptodate)
{
    unsigned long flags;

    /* An error fails the entire request. */
    if ( !uptodate )
    {
        DPRINTK("Buffer not up-to-date at end of operation\n");
        pending_req->status = BLKIF_RSP_ERROR;
    }

    if ( atomic_dec_and_test(&pending_req->pendcnt) )
    {
        int pending_idx = pending_req - pending_reqs;
        fast_flush_area(pending_idx, pending_req->nr_pages);
        make_response(pending_req->blkif, pending_req->id,
                      pending_req->operation, pending_req->status);
        blkif_put(pending_req->blkif);
        spin_lock_irqsave(&pend_prod_lock, flags);
        pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
        spin_unlock_irqrestore(&pend_prod_lock, flags);
        maybe_trigger_io_schedule();
    }
}

static void end_block_io_op(struct buffer_head *bh, int uptodate)
{
    __end_block_io_op(bh->b_private, uptodate);
    kmem_cache_free(buffer_head_cachep, bh);
}


/******************************************************************************
 * NOTIFICATION FROM GUEST OS.
 */

void blkif_be_int(int irq, void *dev_id, struct pt_regs *regs)
{
    blkif_t *blkif = dev_id;
    add_to_blkdev_list_tail(blkif);
    maybe_trigger_io_schedule();
}


/******************************************************************
 * DOWNWARD CALLS -- These interface with the block-device layer proper.
 */

static int do_block_io_op(blkif_t *blkif, int max_to_do)
{
    blkif_ring_t    *blk_ring = blkif->blk_ring_base;
    blkif_request_t *req;
    BLK_RING_IDX     i;
    int              more_to_do = 0;

    /* Take items off the comms ring, taking care not to overflow. */
    for ( i = blkif->blk_req_cons;
          (i != blk_ring->req_prod) &&
              ((i - blkif->blk_resp_prod) != BLK_RING_SIZE);
          i++ )
    {
        if ( (max_to_do-- == 0) || (NR_PENDING_REQS == MAX_PENDING_REQS) )
        {
            more_to_do = 1;
            break;
        }

        req = &blk_ring->ring[MASK_BLK_IDX(i)].req;
        switch ( req->operation )
        {
        case BLKIF_OP_READ:
        case BLKIF_OP_WRITE:
            dispatch_rw_block_io(blkif, req);
            break;

        case BLKIF_OP_PROBE:
            dispatch_probe(blkif, req);
            break;

        default:
            DPRINTK("error: unknown block io operation [%d]\n",
                    req->operation);
            make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
            break;
        }
    }

    blkif->blk_req_cons = i;
    return more_to_do;
}

static void dispatch_probe(blkif_t *blkif, blkif_request_t *req)
{
    int rsp = BLKIF_RSP_ERROR;
    int pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)];

    /* We expect one buffer only. */
    if ( unlikely(req->nr_segments != 1) )
        goto out;

    /* Make sure the buffer is page-sized. */
    if ( (blkif_first_sect(req->frame_and_sects[0]) != 0) ||
         (blkif_last_sect(req->frame_and_sects[0]) != 7) )
        goto out;

    if ( HYPERVISOR_update_va_mapping_otherdomain(
        MMAP_VADDR(pending_idx, 0) >> PAGE_SHIFT,
        (pte_t) { (req->frame_and_sects[0] & PAGE_MASK) | __PAGE_KERNEL },
        0, blkif->domid) )
        goto out;

    rsp = vbd_probe(blkif, (vdisk_t *)MMAP_VADDR(pending_idx, 0),
                    PAGE_SIZE / sizeof(vdisk_t));

 out:
    fast_flush_area(pending_idx, 1);
    make_response(blkif, req->id, req->operation, rsp);
}
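/*
 * Each frame_and_sects[] entry packs a page frame together with the first and
 * last 512-byte sectors used within that page, as extracted by
 * blkif_first_sect()/blkif_last_sect(). For example, an entry with
 * first_sect == 2 and last_sect == 5 describes four sectors (2048 bytes)
 * starting at byte offset 1024 within the frame; this is why the probe path
 * above insists on first_sect == 0 and last_sect == 7, i.e. a buffer covering
 * the whole page.
 */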
static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req)
{
    extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]);
    struct buffer_head *bh;
    int operation = (req->operation == BLKIF_OP_WRITE) ? WRITE : READ;
    short nr_sects;
    unsigned long buffer, fas;
    int i, tot_sects, pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)];
    pending_req_t *pending_req;
    unsigned long  remap_prot;
    multicall_entry_t mcl[MMAP_PAGES_PER_REQUEST];

    /* We map virtual scatter/gather segments to physical segments. */
    int new_segs, nr_psegs = 0;
    phys_seg_t phys_seg[BLKIF_MAX_SEGMENTS_PER_REQUEST + 1];

    /* Check that number of segments is sane. */
    if ( unlikely(req->nr_segments == 0) ||
         unlikely(req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST) )
    {
        DPRINTK("Bad number of segments in request (%d)\n", req->nr_segments);
        goto bad_descriptor;
    }

    /*
     * Check each address/size pair is sane, and convert into a
     * physical device and block offset. Note that if the offset and size
     * crosses a virtual extent boundary, we may end up with more
     * physical scatter/gather segments than virtual segments.
     */
    for ( i = tot_sects = 0; i < req->nr_segments; i++, tot_sects += nr_sects )
    {
        fas      = req->frame_and_sects[i];
        buffer   = (fas & PAGE_MASK) | (blkif_first_sect(fas) << 9);
        nr_sects = blkif_last_sect(fas) - blkif_first_sect(fas) + 1;

        if ( nr_sects <= 0 )
            goto bad_descriptor;

        phys_seg[nr_psegs].dev           = req->device;
        phys_seg[nr_psegs].sector_number = req->sector_number + tot_sects;
        phys_seg[nr_psegs].buffer        = buffer;
        phys_seg[nr_psegs].nr_sects      = nr_sects;

        /* Translate the request into the relevant 'physical device' */
        new_segs = vbd_translate(&phys_seg[nr_psegs], blkif, operation);
        if ( new_segs < 0 )
        {
            DPRINTK("access denied: %s of [%llu,%llu] on dev=%04x\n",
                    operation == READ ? "read" : "write",
                    req->sector_number + tot_sects,
                    req->sector_number + tot_sects + nr_sects,
                    req->device);
            goto bad_descriptor;
        }

        nr_psegs += new_segs;
        ASSERT(nr_psegs <= (BLKIF_MAX_SEGMENTS_PER_REQUEST+1));
    }

    /* Nonsensical zero-sized request? */
    if ( unlikely(nr_psegs == 0) )
        goto bad_descriptor;

    if ( operation == READ )
        remap_prot = _PAGE_PRESENT|_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW;
    else
        remap_prot = _PAGE_PRESENT|_PAGE_DIRTY|_PAGE_ACCESSED;

    for ( i = 0; i < nr_psegs; i++ )
    {
        mcl[i].op = __HYPERVISOR_update_va_mapping_otherdomain;
        mcl[i].args[0] = MMAP_VADDR(pending_idx, i) >> PAGE_SHIFT;
        mcl[i].args[1] = (phys_seg[i].buffer & PAGE_MASK) | remap_prot;
        mcl[i].args[2] = 0;
        mcl[i].args[3] = blkif->domid;

        phys_to_machine_mapping[__pa(MMAP_VADDR(pending_idx, i))>>PAGE_SHIFT] =
            phys_seg[i].buffer >> PAGE_SHIFT;
    }

    (void)HYPERVISOR_multicall(mcl, nr_psegs);

    for ( i = 0; i < nr_psegs; i++ )
    {
        if ( unlikely(mcl[i].args[5] != 0) )
        {
            DPRINTK("invalid buffer -- could not remap it\n");
            fast_flush_area(pending_idx, nr_psegs);
            goto bad_descriptor;
        }
    }

    pending_req = &pending_reqs[pending_idx];
    pending_req->blkif     = blkif;
    pending_req->id        = req->id;
    pending_req->operation = operation;
    pending_req->status    = BLKIF_RSP_OKAY;
    pending_req->nr_pages  = nr_psegs;
    atomic_set(&pending_req->pendcnt, nr_psegs);
    pending_cons++;

    blkif_get(blkif);

    /* Now we pass each segment down to the real blkdev layer. */
    for ( i = 0; i < nr_psegs; i++ )
    {
        bh = kmem_cache_alloc(buffer_head_cachep, GFP_ATOMIC);
        if ( unlikely(bh == NULL) )
        {
            __end_block_io_op(pending_req, 0);
            continue;
        }
        memset(bh, 0, sizeof (struct buffer_head));

        init_waitqueue_head(&bh->b_wait);
        bh->b_size    = phys_seg[i].nr_sects << 9;
        bh->b_dev     = phys_seg[i].dev;
        bh->b_rdev    = phys_seg[i].dev;
        bh->b_rsector = (unsigned long)phys_seg[i].sector_number;
        bh->b_data    = (char *)MMAP_VADDR(pending_idx, i) +
            (phys_seg[i].buffer & ~PAGE_MASK);
        bh->b_page    = virt_to_page(MMAP_VADDR(pending_idx, i));
        bh->b_end_io  = end_block_io_op;
        bh->b_private = pending_req;

        bh->b_state = (1 << BH_Mapped) | (1 << BH_Lock) |
            (1 << BH_Req) | (1 << BH_Launder);
        if ( operation == WRITE )
            bh->b_state |= (1 << BH_JBD) | (1 << BH_Req) | (1 << BH_Uptodate);

        atomic_set(&bh->b_count, 1);

        /* Dispatch a single request. We'll flush it to disc later. */
        generic_make_request(operation, bh);
    }

    return;

 bad_descriptor:
    make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
}
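/*
 * Note on the mapping layout used above: MMAP_VADDR(idx, i) and
 * MMAP_VADDR(idx, i+1) are 2*PAGE_SIZE apart, so each remapped segment is
 * followed by an unmapped guard page. As the comment at the top of the file
 * explains, this stops the IDE/SCSI merge routines from treating consecutive
 * segments of one request as page-adjacent buffers and wrongly coalescing
 * them into a single s-g element.
 */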
/******************************************************************
 * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING
 */

static void make_response(blkif_t *blkif, unsigned long id,
                          unsigned short op, int st)
{
    blkif_response_t *resp;
    unsigned long     flags;

    /* Place on the response ring for the relevant domain. */
    spin_lock_irqsave(&blkif->blk_ring_lock, flags);
    resp = &blkif->blk_ring_base->
        ring[MASK_BLK_IDX(blkif->blk_resp_prod)].resp;
    resp->id        = id;
    resp->operation = op;
    resp->status    = st;
    wmb();
    blkif->blk_ring_base->resp_prod = ++blkif->blk_resp_prod;
    spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);

    /* Kick the relevant domain. */
    notify_via_evtchn(blkif->evtchn);
}

void blkif_deschedule(blkif_t *blkif)
{
    remove_from_blkdev_list(blkif);
}

static int __init init_module(void)
{
    int i;

    if ( !(start_info.flags & SIF_INITDOMAIN) &&
         !(start_info.flags & SIF_BLK_BE_DOMAIN) )
        return 0;

    blkif_interface_init();

    if ( (mmap_vstart = allocate_empty_lowmem_region(MMAP_PAGES)) == 0 )
        BUG();

    pending_cons = 0;
    pending_prod = MAX_PENDING_REQS;
    memset(pending_reqs, 0, sizeof(pending_reqs));
    for ( i = 0; i < MAX_PENDING_REQS; i++ )
        pending_ring[i] = i;

    spin_lock_init(&io_schedule_list_lock);
    INIT_LIST_HEAD(&io_schedule_list);

    buffer_head_cachep = kmem_cache_create(
        "buffer_head_cache", sizeof(struct buffer_head),
        0, SLAB_HWCACHE_ALIGN, NULL, NULL);

    blkif_ctrlif_init();

    return 0;
}

static void cleanup_module(void)
{
    BUG();
}

module_init(init_module);
module_exit(cleanup_module);
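/*
 * Note on make_response(): the wmb() publishes the response body before
 * resp_prod is advanced. A front end (see arch/xen/drivers/blkif/frontend)
 * must consume with the mirror-image ordering -- roughly, using a
 * hypothetical consumer index 'resp_cons':
 *
 *     while ( resp_cons != blk_ring->resp_prod )
 *     {
 *         rmb();  // read resp_prod before the responses it covers
 *         resp = &blk_ring->ring[MASK_BLK_IDX(resp_cons++)].resp;
 *         ...
 *     }
 *
 * notify_via_evtchn() is issued only after the producer index is visible.
 */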