/******************************************************************************
 * arch/xen/drivers/netif/frontend/main.c
 *
 * Virtual network driver for XenoLinux.
 *
 * Copyright (c) 2002-2004, K A Fraser
 */

/* NB. The header names below were lost in transcription; this is the usual set
 * needed by the code in this file. */
#include <linux/config.h>
#include <linux/module.h>
#include <linux/version.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/errno.h>

#include <linux/netdevice.h>
#include <linux/inetdevice.h>
#include <linux/etherdevice.h>
#include <linux/skbuff.h>
#include <linux/init.h>

#include <asm/io.h>
#include <net/sock.h>
#include <net/pkt_sched.h>

#include <asm/evtchn.h>
#include <asm/ctrl_if.h>

#include <asm/page.h>

#include "../netif.h"

#define RX_BUF_SIZE ((PAGE_SIZE/2)+1) /* Fool the slab allocator :-) */

static void network_tx_buf_gc(struct net_device *dev);
static void network_alloc_rx_buffers(struct net_device *dev);
static void cleanup_module(void);

static unsigned long rx_pfn_array[NETIF_RX_RING_SIZE];
static multicall_entry_t rx_mcl[NETIF_RX_RING_SIZE+1];
static mmu_update_t rx_mmu[NETIF_RX_RING_SIZE];

static struct list_head dev_list;

struct net_private
{
    struct list_head list;
    struct net_device *dev;

    struct net_device_stats stats;
    NETIF_RING_IDX rx_resp_cons, tx_resp_cons;
    unsigned int tx_full;

    netif_tx_interface_t *tx;
    netif_rx_interface_t *rx;

    spinlock_t   tx_lock;
    spinlock_t   rx_lock;

    unsigned int handle;
    unsigned int evtchn;
    unsigned int irq;

#define NETIF_STATE_CLOSED       0
#define NETIF_STATE_DISCONNECTED 1
#define NETIF_STATE_CONNECTED    2
#define NETIF_STATE_ACTIVE       3
    unsigned int state;

    /*
     * {tx,rx}_skbs store outstanding skbuffs. The first entry in each
     * array is an index into a chain of free entries.
     */
    struct sk_buff *tx_skbs[NETIF_TX_RING_SIZE+1];
    struct sk_buff *rx_skbs[NETIF_RX_RING_SIZE+1];
};

/* Access macros for acquiring/freeing slots in {tx,rx}_skbs[]. */
#define ADD_ID_TO_FREELIST(_list, _id)             \
    (_list)[(_id)] = (_list)[0];                   \
    (_list)[0]     = (void *)(unsigned long)(_id);
#define GET_ID_FROM_FREELIST(_list)                \
 ({ unsigned long _id = (unsigned long)(_list)[0]; \
    (_list)[0]  = (_list)[_id];                    \
    (unsigned short)_id; })


static struct net_device *find_dev_by_handle(unsigned int handle)
{
    struct list_head *ent;
    struct net_private *np;

    list_for_each ( ent, &dev_list )
    {
        np = list_entry(ent, struct net_private, list);
        if ( np->handle == handle )
            return np->dev;
    }

    return NULL;
}


static int network_open(struct net_device *dev)
{
    struct net_private *np = dev->priv;
    int i;

    if ( np->state != NETIF_STATE_CONNECTED )
        return -EINVAL;

    np->rx_resp_cons = np->tx_resp_cons = np->tx_full = 0;
    memset(&np->stats, 0, sizeof(np->stats));
    spin_lock_init(&np->tx_lock);
    spin_lock_init(&np->rx_lock);

    /* Initialise {tx,rx}_skbs to be a free chain containing every entry. */
    for ( i = 0; i <= NETIF_TX_RING_SIZE; i++ )
        np->tx_skbs[i] = (void *)(i+1);
    for ( i = 0; i <= NETIF_RX_RING_SIZE; i++ )
        np->rx_skbs[i] = (void *)(i+1);

    wmb();
    np->state = NETIF_STATE_ACTIVE;

    network_alloc_rx_buffers(dev);
    np->rx->event = np->rx_resp_cons + 1;

    netif_start_queue(dev);

    MOD_INC_USE_COUNT;

    return 0;
}
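
/*
 * Transmit-completion handling.  network_tx_buf_gc() reaps descriptors that
 * the backend has responded to: each response carries the id that was handed
 * out by GET_ID_FROM_FREELIST() in the transmit path, so the matching skb can
 * be freed and the id returned to the tx_skbs[] free chain (entry 0 of which
 * heads the chain).  tx->event is then set roughly half-way into the requests
 * still outstanding so that the backend raises the next completion interrupt
 * in good time, and the mb()/resp_prod re-check closes the race against
 * responses that arrive while the event threshold is being rewritten.
 */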
static void network_tx_buf_gc(struct net_device *dev)
{
    NETIF_RING_IDX i, prod;
    unsigned short id;
    struct net_private *np = dev->priv;
    struct sk_buff *skb;

    do {
        prod = np->tx->resp_prod;

        for ( i = np->tx_resp_cons; i != prod; i++ )
        {
            id  = np->tx->ring[MASK_NET_TX_IDX(i)].resp.id;
            skb = np->tx_skbs[id];
            ADD_ID_TO_FREELIST(np->tx_skbs, id);
            dev_kfree_skb_any(skb);
        }

        np->tx_resp_cons = prod;

        /*
         * Set a new event, then check for race with update of tx_cons. Note
         * that it is essential to schedule a callback, no matter how few
         * buffers are pending. Even if there is space in the transmit ring,
         * higher layers may be blocked because too much data is outstanding:
         * in such cases notification from Xen is likely to be the only kick
         * that we'll get.
         */
        np->tx->event = prod + ((np->tx->req_prod - prod) >> 1) + 1;
        mb();
    }
    while ( prod != np->tx->resp_prod );

    if ( np->tx_full && ((np->tx->req_prod - prod) < NETIF_TX_RING_SIZE) )
    {
        np->tx_full = 0;
        if ( np->state == NETIF_STATE_ACTIVE )
            netif_wake_queue(dev);
    }
}


static void network_alloc_rx_buffers(struct net_device *dev)
{
    unsigned short id;
    struct net_private *np = dev->priv;
    struct sk_buff *skb;
    NETIF_RING_IDX i = np->rx->req_prod;
    int nr_pfns = 0;

    /* Make sure the batch is large enough to be worthwhile (1/2 ring). */
    if ( unlikely((i - np->rx_resp_cons) > (NETIF_RX_RING_SIZE/2)) ||
         unlikely(np->state != NETIF_STATE_ACTIVE) )
        return;

    do {
        skb = dev_alloc_skb(RX_BUF_SIZE);
        if ( unlikely(skb == NULL) )
            break;

        skb->dev = dev;

        if ( unlikely(((unsigned long)skb->head & (PAGE_SIZE-1)) != 0) )
            panic("alloc_skb needs to provide us page-aligned buffers.");

        id = GET_ID_FROM_FREELIST(np->rx_skbs);

        np->rx_skbs[id] = skb;

        np->rx->ring[MASK_NET_RX_IDX(i)].req.id = id;

        rx_pfn_array[nr_pfns] = virt_to_machine(skb->head) >> PAGE_SHIFT;

        rx_mcl[nr_pfns].op      = __HYPERVISOR_update_va_mapping;
        rx_mcl[nr_pfns].args[0] = (unsigned long)skb->head >> PAGE_SHIFT;
        rx_mcl[nr_pfns].args[1] = 0;
        rx_mcl[nr_pfns].args[2] = 0;

        nr_pfns++;
    }
    while ( (++i - np->rx_resp_cons) != NETIF_RX_RING_SIZE );

    /* If no buffers could be allocated there is nothing to flush or hand over
     * to Xen, and indexing rx_mcl[nr_pfns-1] below would be bogus. */
    if ( unlikely(nr_pfns == 0) )
        return;

    /*
     * We may have allocated buffers which have entries outstanding in the page
     * update queue -- make sure we flush those first!
     */
    flush_page_update_queue();

    /* After all PTEs have been zapped we blow away stale TLB entries. */
    rx_mcl[nr_pfns-1].args[2] = UVMF_FLUSH_TLB;

    /* Give away a batch of pages. */
    rx_mcl[nr_pfns].op      = __HYPERVISOR_dom_mem_op;
    rx_mcl[nr_pfns].args[0] = MEMOP_decrease_reservation;
    rx_mcl[nr_pfns].args[1] = (unsigned long)rx_pfn_array;
    rx_mcl[nr_pfns].args[2] = (unsigned long)nr_pfns;

    /* Zap PTEs and give away pages in one big multicall. */
    (void)HYPERVISOR_multicall(rx_mcl, nr_pfns+1);

    /* Check return status of HYPERVISOR_dom_mem_op(). */
    if ( rx_mcl[nr_pfns].args[5] != nr_pfns )
        panic("Unable to reduce memory reservation\n");

    np->rx->req_prod = i;
}
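
/*
 * Transmit path.  Each netif_tx_request_t describes a single machine-contiguous
 * buffer, so a packet whose data crosses a page boundary is first copied into
 * a freshly allocated page-aligned skb in network_start_xmit() below.  The
 * event channel is only kicked when the backend has already consumed every
 * previously queued request; otherwise it is assumed to still be processing
 * the ring and will pick up the new request without another notification.
 */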
static int network_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
    unsigned short id;
    struct net_private *np = (struct net_private *)dev->priv;
    netif_tx_request_t *tx;
    NETIF_RING_IDX i;

    if ( unlikely(np->tx_full) )
    {
        printk(KERN_ALERT "%s: full queue wasn't stopped!\n", dev->name);
        netif_stop_queue(dev);
        return -ENOBUFS;
    }

    if ( unlikely((((unsigned long)skb->data & ~PAGE_MASK) + skb->len) >=
                  PAGE_SIZE) )
    {
        struct sk_buff *new_skb = dev_alloc_skb(RX_BUF_SIZE);
        if ( unlikely(new_skb == NULL) )
            return 1;
        skb_put(new_skb, skb->len);
        memcpy(new_skb->data, skb->data, skb->len);
        dev_kfree_skb(skb);
        skb = new_skb;
    }

    spin_lock_irq(&np->tx_lock);

    /* If the backend isn't available then don't do anything! */
    if ( !netif_carrier_ok(dev) )
    {
        spin_unlock_irq(&np->tx_lock);
        return 1;
    }

    i = np->tx->req_prod;

    id = GET_ID_FROM_FREELIST(np->tx_skbs);
    np->tx_skbs[id] = skb;

    tx = &np->tx->ring[MASK_NET_TX_IDX(i)].req;

    tx->id   = id;
    tx->addr = virt_to_machine(skb->data);
    tx->size = skb->len;

    wmb();
    np->tx->req_prod = i + 1;

    network_tx_buf_gc(dev);

    if ( (i - np->tx_resp_cons) == (NETIF_TX_RING_SIZE - 1) )
    {
        np->tx_full = 1;
        netif_stop_queue(dev);
    }

    spin_unlock_irq(&np->tx_lock);

    np->stats.tx_bytes += skb->len;
    np->stats.tx_packets++;

    /* Only notify Xen if there are no outstanding responses. */
    mb();
    if ( np->tx->resp_prod == i )
        notify_via_evtchn(np->evtchn);

    return 0;
}


static void netif_int(int irq, void *dev_id, struct pt_regs *ptregs)
{
    struct net_device *dev = dev_id;
    struct net_private *np = dev->priv;
    unsigned long flags;

    spin_lock_irqsave(&np->tx_lock, flags);

    if ( !netif_carrier_ok(dev) )
    {
        spin_unlock_irqrestore(&np->tx_lock, flags);
        return;
    }

    network_tx_buf_gc(dev);
    spin_unlock_irqrestore(&np->tx_lock, flags);

    if ( np->rx_resp_cons != np->rx->resp_prod )
        netif_rx_schedule(dev);
}


static int netif_poll(struct net_device *dev, int *pbudget)
{
    struct net_private *np = dev->priv;
    struct sk_buff *skb;
    netif_rx_response_t *rx;
    NETIF_RING_IDX i;
    mmu_update_t *mmu = rx_mmu;
    multicall_entry_t *mcl = rx_mcl;
    int work_done, budget, more_to_do = 1;
    struct sk_buff_head rxq;
    unsigned long flags;

    spin_lock(&np->rx_lock);

    /* If the device is undergoing recovery then don't do anything. */
    if ( !netif_carrier_ok(dev) )
    {
        spin_unlock(&np->rx_lock);
        return 0;
    }

    skb_queue_head_init(&rxq);

    if ( (budget = *pbudget) > dev->quota )
        budget = dev->quota;

    for ( i = np->rx_resp_cons, work_done = 0;
          (i != np->rx->resp_prod) && (work_done < budget);
          i++, work_done++ )
    {
        rx = &np->rx->ring[MASK_NET_RX_IDX(i)].resp;

        skb = np->rx_skbs[rx->id];
        ADD_ID_TO_FREELIST(np->rx_skbs, rx->id);

        if ( unlikely(rx->status <= 0) )
        {
            /* Gate this error. We get a (valid) slew of them on suspend. */
            if ( np->state == NETIF_STATE_ACTIVE )
                printk(KERN_ALERT "bad buffer on RX ring!(%d)\n", rx->status);
            dev_kfree_skb(skb);
            continue;
        }

        skb->data = skb->tail = skb->head + (rx->addr & ~PAGE_MASK);
        skb_put(skb, rx->status);

        np->stats.rx_packets++;
        np->stats.rx_bytes += rx->status;

        /* Remap the page. */
        mmu->ptr = (rx->addr & PAGE_MASK) | MMU_MACHPHYS_UPDATE;
        mmu->val = __pa(skb->head) >> PAGE_SHIFT;
        mmu++;
        mcl->op      = __HYPERVISOR_update_va_mapping;
        mcl->args[0] = (unsigned long)skb->head >> PAGE_SHIFT;
        mcl->args[1] = (rx->addr & PAGE_MASK) | __PAGE_KERNEL;
        mcl->args[2] = 0;
        mcl++;

        phys_to_machine_mapping[__pa(skb->head) >> PAGE_SHIFT] =
            rx->addr >> PAGE_SHIFT;

        __skb_queue_tail(&rxq, skb);
    }

    /* Do all the remapping work, and M->P updates, in one big hypercall. */
    if ( likely((mcl - rx_mcl) != 0) )
    {
        mcl->op      = __HYPERVISOR_mmu_update;
        mcl->args[0] = (unsigned long)rx_mmu;
        mcl->args[1] = mmu - rx_mmu;
        mcl->args[2] = 0;
        mcl++;
        (void)HYPERVISOR_multicall(rx_mcl, mcl - rx_mcl);
    }

    while ( (skb = __skb_dequeue(&rxq)) != NULL )
    {
        /* Set the shared-info area, which is hidden behind the real data. */
        atomic_set(&(skb_shinfo(skb)->dataref), 1);
        skb_shinfo(skb)->nr_frags = 0;
        skb_shinfo(skb)->frag_list = NULL;

        /* Ethernet-specific work. Delayed to here as it peeks the header. */
        skb->protocol = eth_type_trans(skb, dev);

        /* Pass it up. */
        netif_rx(skb);
        dev->last_rx = jiffies;
    }

    np->rx_resp_cons = i;

    network_alloc_rx_buffers(dev);

    *pbudget   -= work_done;
    dev->quota -= work_done;

    if ( work_done < budget )
    {
        local_irq_save(flags);

        np->rx->event = i + 1;

        /* Deal with hypervisor racing our resetting of rx_event. */
        mb();
        if ( np->rx->resp_prod == i )
        {
            __netif_rx_complete(dev);
            more_to_do = 0;
        }

        local_irq_restore(flags);
    }

    spin_unlock(&np->rx_lock);

    return more_to_do;
}
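
/*
 * Receive buffers follow a page-flipping scheme.  network_alloc_rx_buffers()
 * unmaps each buffer page and returns it to Xen via MEMOP_decrease_reservation;
 * when the backend delivers a packet it hands back a (possibly different)
 * machine page.  netif_poll() therefore re-establishes the virtual mapping
 * (update_va_mapping), patches the machine-to-physical table
 * (MMU_MACHPHYS_UPDATE) and updates the local phys_to_machine_mapping[] entry
 * before an skb can be passed up the stack -- all batched into one multicall.
 */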
static int network_close(struct net_device *dev)
{
    struct net_private *np = dev->priv;

    netif_stop_queue(np->dev);

    np->state = NETIF_STATE_CONNECTED;

    /* XXX We need to properly disconnect via the domain controller. */
    while ( /*(np->rx_resp_cons != np->rx->req_prod) ||*/
            (np->tx_resp_cons != np->tx->req_prod) )
    {
        barrier();
        current->state = TASK_INTERRUPTIBLE;
        schedule_timeout(1);
    }

    MOD_DEC_USE_COUNT;

    return 0;
}


static struct net_device_stats *network_get_stats(struct net_device *dev)
{
    struct net_private *np = (struct net_private *)dev->priv;
    return &np->stats;
}
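
/*
 * Interface state is driven by CMSG_NETIF_FE_INTERFACE_STATUS_CHANGED messages
 * from the domain controller.  netif_status_change() below walks the handshake:
 * on DISCONNECTED it allocates the shared tx/rx ring pages and replies with an
 * interface-CONNECT message carrying their machine frame numbers; on CONNECTED
 * it records the MAC address and event channel and, if this is a reconnection
 * after backend failure, requeues any outstanding tx/rx buffers before turning
 * the carrier back on.
 */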
static void netif_status_change(netif_fe_interface_status_changed_t *status)
{
    ctrl_msg_t                   cmsg;
    netif_fe_interface_connect_t up;
    struct net_device *dev;
    struct net_private *np;

    if ( status->handle != 0 )
    {
        printk(KERN_WARNING "Status change on unsupported netif %d\n",
               status->handle);
        return;
    }

    dev = find_dev_by_handle(0);
    np  = dev->priv;

    switch ( status->status )
    {
    case NETIF_INTERFACE_STATUS_DESTROYED:
        printk(KERN_WARNING "Unexpected netif-DESTROYED message in state %d\n",
               np->state);
        break;

    case NETIF_INTERFACE_STATUS_DISCONNECTED:
        if ( np->state != NETIF_STATE_CLOSED )
        {
            printk(KERN_WARNING "Unexpected netif-DISCONNECTED message"
                   " in state %d\n", np->state);
            printk(KERN_INFO "Attempting to reconnect network interface\n");

            /* Begin interface recovery.
             *
             * NB. Whilst we're recovering, we turn the carrier state off.  We
             * take measures to ensure that this device isn't used for
             * anything.  We also stop the queue for this device.  Various
             * different approaches (e.g. continuing to buffer packets) have
             * been tested but don't appear to improve the overall impact on
             * TCP connections.
             *
             * TODO: (MAW) Change the Xend<->Guest protocol so that a recovery
             * is initiated by a special "RESET" message - disconnect could
             * just mean we're not allowed to use this interface any more.
             */

            /* Stop old i/f to prevent errors whilst we rebuild the state. */
            spin_lock_irq(&np->tx_lock);
            spin_lock(&np->rx_lock);
            netif_stop_queue(dev);
            netif_carrier_off(dev);
            np->state = NETIF_STATE_DISCONNECTED;
            spin_unlock(&np->rx_lock);
            spin_unlock_irq(&np->tx_lock);

            /* Free resources. */
            free_irq(np->irq, dev);
            unbind_evtchn_from_irq(np->evtchn);
            free_page((unsigned long)np->tx);
            free_page((unsigned long)np->rx);
        }

        /* Move from CLOSED to DISCONNECTED state. */
        np->tx = (netif_tx_interface_t *)__get_free_page(GFP_KERNEL);
        np->rx = (netif_rx_interface_t *)__get_free_page(GFP_KERNEL);
        memset(np->tx, 0, PAGE_SIZE);
        memset(np->rx, 0, PAGE_SIZE);
        np->state = NETIF_STATE_DISCONNECTED;

        /* Construct an interface-CONNECT message for the domain controller. */
        cmsg.type         = CMSG_NETIF_FE;
        cmsg.subtype      = CMSG_NETIF_FE_INTERFACE_CONNECT;
        cmsg.length       = sizeof(netif_fe_interface_connect_t);
        up.handle         = 0;
        up.tx_shmem_frame = virt_to_machine(np->tx) >> PAGE_SHIFT;
        up.rx_shmem_frame = virt_to_machine(np->rx) >> PAGE_SHIFT;
        memcpy(cmsg.msg, &up, sizeof(up));

        /* Tell the controller to bring up the interface. */
        ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
        break;

    case NETIF_INTERFACE_STATUS_CONNECTED:
        if ( np->state == NETIF_STATE_CLOSED )
        {
            printk(KERN_WARNING "Unexpected netif-CONNECTED message"
                   " in state %d\n", np->state);
            break;
        }

        memcpy(dev->dev_addr, status->mac, ETH_ALEN);

        if ( netif_carrier_ok(dev) )
            np->state = NETIF_STATE_CONNECTED;
        else
        {
            int i, requeue_idx;
            netif_tx_request_t *tx;

            spin_lock_irq(&np->rx_lock);
            spin_lock(&np->tx_lock);

            /* Recovery procedure: */

            /* Step 1: Reinitialise variables. */
            np->rx_resp_cons = np->tx_resp_cons = np->tx_full = 0;
            np->rx->event = 1;

            /* Step 2: Rebuild the RX and TX ring contents.
             * NB. We could just free the queued TX packets now but we hope
             * that sending them out might do some good.  We have to rebuild
             * the RX ring because some of our pages are currently flipped out
             * so we can't just free the RX skbs.
             * NB2. Freelist index entries are always going to be less than
             * __PAGE_OFFSET, whereas pointers to skbs will always be equal or
             * greater than __PAGE_OFFSET: we use this property to distinguish
             * them.
             */

            /* Rebuild the TX buffer freelist and the TX ring itself.
             * NB. This reorders packets.  We could keep more private state
             * to avoid this but maybe it doesn't matter so much given the
             * interface has been down.
             */
            for ( requeue_idx = 0, i = 1; i <= NETIF_TX_RING_SIZE; i++ )
            {
                if ( (unsigned long)np->tx_skbs[i] >= __PAGE_OFFSET )
                {
                    struct sk_buff *skb = np->tx_skbs[i];

                    tx = &np->tx->ring[requeue_idx++].req;

                    tx->id   = i;
                    tx->addr = virt_to_machine(skb->data);
                    tx->size = skb->len;

                    np->stats.tx_bytes += skb->len;
                    np->stats.tx_packets++;
                }
            }
            wmb();
            np->tx->req_prod = requeue_idx;

            /* Rebuild the RX buffer freelist and the RX ring itself. */
            for ( requeue_idx = 0, i = 1; i <= NETIF_RX_RING_SIZE; i++ )
                if ( (unsigned long)np->rx_skbs[i] >= __PAGE_OFFSET )
                    np->rx->ring[requeue_idx++].req.id = i;
            wmb();
            np->rx->req_prod = requeue_idx;

            /* Step 3: All public and private state should now be sane.  Get
             * ready to start sending and receiving packets and give the driver
             * domain a kick because we've probably just requeued some
             * packets.
             */
            netif_carrier_on(dev);
            netif_start_queue(dev);
            np->state = NETIF_STATE_ACTIVE;

            notify_via_evtchn(status->evtchn);

            network_tx_buf_gc(dev);

            printk(KERN_INFO "Recovery completed\n");

            spin_unlock(&np->tx_lock);
            spin_unlock_irq(&np->rx_lock);
        }

        np->evtchn = status->evtchn;
        np->irq = bind_evtchn_to_irq(np->evtchn);
        (void)request_irq(np->irq, netif_int, SA_SAMPLE_RANDOM,
                          dev->name, dev);
        break;

    default:
        printk(KERN_WARNING "Status change to unknown value %d\n",
               status->status);
        break;
    }
}


static void netif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id)
{
    switch ( msg->subtype )
    {
    case CMSG_NETIF_FE_INTERFACE_STATUS_CHANGED:
        if ( msg->length != sizeof(netif_fe_interface_status_changed_t) )
            goto parse_error;
        netif_status_change((netif_fe_interface_status_changed_t *)
                            &msg->msg[0]);
        break;
    default:
        goto parse_error;
    }

    ctrl_if_send_response(msg);
    return;

 parse_error:
    msg->length = 0;
    ctrl_if_send_response(msg);
}
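
/*
 * Module initialisation registers a single net_device, subscribes to
 * CMSG_NETIF_FE control messages, and announces NETIF_DRIVER_STATUS_UP to the
 * domain controller.  The controller responds with the status-change messages
 * handled above, so init_module() simply sleeps in short timeouts until the
 * interface reaches NETIF_STATE_CONNECTED.  As the comment in the function
 * notes, a fuller implementation would read 'nr_interfaces' from the response
 * rather than assuming exactly one interface.
 */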
static int __init init_module(void)
{
    ctrl_msg_t                       cmsg;
    netif_fe_driver_status_changed_t st;
    int err;
    struct net_device *dev;
    struct net_private *np;

    if ( (start_info.flags & SIF_INITDOMAIN) ||
         (start_info.flags & SIF_NET_BE_DOMAIN) )
        return 0;

    printk("Initialising Xen virtual ethernet frontend driver\n");

    INIT_LIST_HEAD(&dev_list);

    if ( (dev = alloc_etherdev(sizeof(struct net_private))) == NULL )
    {
        err = -ENOMEM;
        goto fail;
    }

    np = dev->priv;
    np->state  = NETIF_STATE_CLOSED;
    np->handle = 0;

    dev->open            = network_open;
    dev->hard_start_xmit = network_start_xmit;
    dev->stop            = network_close;
    dev->get_stats       = network_get_stats;
    dev->poll            = netif_poll;
    dev->weight          = 64;

    if ( (err = register_netdev(dev)) != 0 )
    {
        kfree(dev);
        goto fail;
    }

    np->dev = dev;
    list_add(&np->list, &dev_list);

    (void)ctrl_if_register_receiver(CMSG_NETIF_FE, netif_ctrlif_rx,
                                    CALLBACK_IN_BLOCKING_CONTEXT);

    /* Send a driver-UP notification to the domain controller. */
    cmsg.type    = CMSG_NETIF_FE;
    cmsg.subtype = CMSG_NETIF_FE_DRIVER_STATUS_CHANGED;
    cmsg.length  = sizeof(netif_fe_driver_status_changed_t);
    st.status    = NETIF_DRIVER_STATUS_UP;
    memcpy(cmsg.msg, &st, sizeof(st));
    ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);

    /*
     * We should read 'nr_interfaces' from response message and wait
     * for notifications before proceeding. For now we assume that we
     * will be notified of exactly one interface.
     */
    while ( np->state != NETIF_STATE_CONNECTED )
    {
        set_current_state(TASK_INTERRUPTIBLE);
        schedule_timeout(1);
    }

    return 0;

 fail:
    cleanup_module();
    return err;
}


static void cleanup_module(void)
{
    /* XXX FIXME */
    BUG();
}


module_init(init_module);
module_exit(cleanup_module);