/******************************************************************************
 * network.c
 *
 * Virtual network driver for XenoLinux.
 *
 * Copyright (c) 2002-2003, K A Fraser
 */

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/errno.h>

#include <linux/netdevice.h>
#include <linux/inetdevice.h>
#include <linux/etherdevice.h>
#include <linux/skbuff.h>
#include <linux/init.h>

#include <asm/fixmap.h>
#include <asm/io.h>
#include <net/sock.h>
#include <net/pkt_sched.h>

#define NET_IRQ _EVENT_NET

#define RX_BUF_SIZE ((PAGE_SIZE/2)+1) /* Fool the slab allocator :-) */

static void network_interrupt(int irq, void *dev_id, struct pt_regs *ptregs);
static void network_tx_buf_gc(struct net_device *dev);
static void network_alloc_rx_buffers(struct net_device *dev);
static void cleanup_module(void);

static struct list_head dev_list;

struct net_private
{
    struct list_head list;
    struct net_device *dev;

    struct net_device_stats stats;
    NET_RING_IDX rx_resp_cons, tx_resp_cons;
    unsigned int net_ring_fixmap_idx, tx_full;
    net_ring_t  *net_ring;
    net_idx_t   *net_idx;
    spinlock_t   tx_lock;
    unsigned int idx; /* Domain-specific index of this VIF. */

    unsigned int rx_bufs_to_notify;

#define STATE_ACTIVE    0
#define STATE_SUSPENDED 1
#define STATE_CLOSED    2
    unsigned int state;

    /*
     * {tx,rx}_skbs store outstanding skbuffs. The first entry in each
     * array is an index into a chain of free entries.
     */
    struct sk_buff *tx_skbs[TX_RING_SIZE+1];
    struct sk_buff *rx_skbs[RX_RING_SIZE+1];
};

/* Access macros for acquiring and freeing slots in {tx,rx}_skbs[]. */
#define ADD_ID_TO_FREELIST(_list, _id)                 \
    do {                                               \
        (_list)[(_id)] = (_list)[0];                   \
        (_list)[0]     = (void *)(unsigned long)(_id); \
    } while ( 0 )
#define GET_ID_FROM_FREELIST(_list)                       \
    ({ unsigned long _id = (unsigned long)(_list)[0];     \
       (_list)[0] = (_list)[_id];                         \
       (unsigned short)_id; })


static void _dbg_network_int(struct net_device *dev)
{
    struct net_private *np = dev->priv;

    if ( np->state == STATE_CLOSED )
        return;

    printk(KERN_ALERT "tx_full = %d, tx_resp_cons = 0x%08x,"
           " tx_req_prod = 0x%08x, tx_resp_prod = 0x%08x,"
           " tx_event = 0x%08x, state=%d\n",
           np->tx_full, np->tx_resp_cons,
           np->net_idx->tx_req_prod, np->net_idx->tx_resp_prod,
           np->net_idx->tx_event,
           test_bit(__LINK_STATE_XOFF, &dev->state));
    printk(KERN_ALERT "rx_resp_cons = 0x%08x,"
           " rx_req_prod = 0x%08x, rx_resp_prod = 0x%08x, rx_event = 0x%08x\n",
           np->rx_resp_cons, np->net_idx->rx_req_prod,
           np->net_idx->rx_resp_prod, np->net_idx->rx_event);
}


static void dbg_network_int(int irq, void *unused, struct pt_regs *ptregs)
{
    struct list_head *ent;
    struct net_private *np;
    list_for_each ( ent, &dev_list )
    {
        np = list_entry(ent, struct net_private, list);
        _dbg_network_int(np->dev);
    }
}


static int network_open(struct net_device *dev)
{
    struct net_private *np = dev->priv;
    netop_t netop;
    int i, ret;

    netop.cmd = NETOP_RESET_RINGS;
    netop.vif = np->idx;
    if ( (ret = HYPERVISOR_net_io_op(&netop)) != 0 )
    {
        printk(KERN_ALERT "Possible net trouble: couldn't reset ring idxs\n");
        return ret;
    }

    netop.cmd = NETOP_GET_VIF_INFO;
    netop.vif = np->idx;
    if ( (ret = HYPERVISOR_net_io_op(&netop)) != 0 )
    {
        printk(KERN_ALERT "Couldn't get info for vif %d\n", np->idx);
        return ret;
    }

    memcpy(dev->dev_addr, netop.u.get_vif_info.vmac, ETH_ALEN);

    set_fixmap(FIX_NETRING0_BASE + np->net_ring_fixmap_idx,
               netop.u.get_vif_info.ring_mfn << PAGE_SHIFT);
    np->net_ring = (net_ring_t *)fix_to_virt(
        FIX_NETRING0_BASE + np->net_ring_fixmap_idx);
    np->net_idx  = &HYPERVISOR_shared_info->net_idx[np->idx];

    np->rx_bufs_to_notify = 0;
    np->rx_resp_cons = np->tx_resp_cons = np->tx_full = 0;
    memset(&np->stats, 0, sizeof(np->stats));
    spin_lock_init(&np->tx_lock);
    memset(np->net_ring, 0, sizeof(*np->net_ring));
    memset(np->net_idx, 0, sizeof(*np->net_idx));

    /* Initialise {tx,rx}_skbs to be a free chain containing every entry. */
    for ( i = 0; i <= TX_RING_SIZE; i++ )
        np->tx_skbs[i] = (void *)(i+1);
    for ( i = 0; i <= RX_RING_SIZE; i++ )
        np->rx_skbs[i] = (void *)(i+1);

    wmb();
    np->state = STATE_ACTIVE;

    network_alloc_rx_buffers(dev);

    netif_start_queue(dev);

    MOD_INC_USE_COUNT;

    return 0;
}
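
#if 0
/*
 * Illustrative sketch (not part of the driver): how the id freelist in
 * {tx,rx}_skbs is used. Entry 0 is the chain head, so after network_open()
 * the chain reads 0 -> 1 -> 2 -> ... -> RING_SIZE and ids 1..RING_SIZE are
 * handed out in order, then pushed back on the front when retired. The
 * function name below is hypothetical; the round trip mirrors the TX path.
 */
static void freelist_example(struct net_private *np, struct sk_buff *skb)
{
    unsigned short id;

    id = GET_ID_FROM_FREELIST(np->tx_skbs); /* claim a free slot           */
    np->tx_skbs[id] = skb;                  /* id now names an in-flight skb */

    /* ... later, when the response carrying `id' arrives ... */

    skb = np->tx_skbs[id];
    ADD_ID_TO_FREELIST(np->tx_skbs, id);    /* slot is free again          */
    dev_kfree_skb_any(skb);
}
#endif
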
static void network_tx_buf_gc(struct net_device *dev)
{
    NET_RING_IDX i, prod;
    unsigned short id;
    struct net_private *np = dev->priv;
    struct sk_buff *skb;
    tx_entry_t *tx_ring = np->net_ring->tx_ring;

    do {
        prod = np->net_idx->tx_resp_prod;

        for ( i = np->tx_resp_cons; i != prod; i++ )
        {
            id  = tx_ring[MASK_NET_TX_IDX(i)].resp.id;
            skb = np->tx_skbs[id];
            ADD_ID_TO_FREELIST(np->tx_skbs, id);
            dev_kfree_skb_any(skb);
        }

        np->tx_resp_cons = prod;

        /*
         * Set a new event, then check for race with update of tx_cons. Note
         * that it is essential to schedule a callback, no matter how few
         * buffers are pending. Even if there is space in the transmit ring,
         * higher layers may be blocked because too much data is outstanding:
         * in such cases notification from Xen is likely to be the only kick
         * that we'll get.
         */
        np->net_idx->tx_event =
            prod + ((np->net_idx->tx_req_prod - prod) >> 1) + 1;
        mb();
    }
    while ( prod != np->net_idx->tx_resp_prod );

    if ( np->tx_full && ((np->net_idx->tx_req_prod - prod) < TX_RING_SIZE) )
    {
        np->tx_full = 0;
        if ( np->state == STATE_ACTIVE )
            netif_wake_queue(dev);
    }
}


static inline pte_t *get_ppte(void *addr)
{
    pgd_t *pgd;
    pmd_t *pmd;
    pte_t *pte;

    pgd = pgd_offset_k((unsigned long)addr);
    pmd = pmd_offset(pgd, (unsigned long)addr);
    pte = pte_offset(pmd, (unsigned long)addr);
    return pte;
}


static void network_alloc_rx_buffers(struct net_device *dev)
{
    unsigned short id;
    struct net_private *np = dev->priv;
    struct sk_buff *skb;
    netop_t netop;
    NET_RING_IDX i = np->net_idx->rx_req_prod;

    if ( unlikely((i - np->rx_resp_cons) == RX_RING_SIZE) ||
         unlikely(np->state != STATE_ACTIVE) )
        return;

    do {
        skb = dev_alloc_skb(RX_BUF_SIZE);
        if ( unlikely(skb == NULL) )
            break;

        skb->dev = dev;

        if ( unlikely(((unsigned long)skb->head & (PAGE_SIZE-1)) != 0) )
            panic("alloc_skb needs to provide us page-aligned buffers.");

        id = GET_ID_FROM_FREELIST(np->rx_skbs);
        np->rx_skbs[id] = skb;

        np->net_ring->rx_ring[MASK_NET_RX_IDX(i)].req.id   = id;
        np->net_ring->rx_ring[MASK_NET_RX_IDX(i)].req.addr =
            virt_to_machine(get_ppte(skb->head));

        np->rx_bufs_to_notify++;
    }
    while ( (++i - np->rx_resp_cons) != RX_RING_SIZE );

    /*
     * We may have allocated buffers which have entries outstanding in the
     * page update queue -- make sure we flush those first!
     */
    flush_page_update_queue();

    np->net_idx->rx_req_prod = i;
    np->net_idx->rx_event    = np->rx_resp_cons + 1;

    /* Batch Xen notifications. */
    if ( np->rx_bufs_to_notify > (RX_RING_SIZE/4) )
    {
        netop.cmd = NETOP_PUSH_BUFFERS;
        netop.vif = np->idx;
        (void)HYPERVISOR_net_io_op(&netop);
        np->rx_bufs_to_notify = 0;
    }
}
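
/*
 * Illustrative note (sketch, not a statement of the interface): receive
 * buffers are filled by page flipping rather than copying. Each request
 * above carries the machine address of the PTE mapping the buffer (via
 * get_ppte()), so Xen can swap the page containing the packet directly
 * into our address space. This is also why RX_BUF_SIZE is (PAGE_SIZE/2)+1:
 * assuming 4KB pages and the 2.4 slab allocator's power-of-two caches,
 * dev_alloc_skb(2049) spills past the 2048-byte cache into the 4096-byte
 * one, so each buffer occupies a whole page-aligned page -- which is what
 * the panic() check on skb->head alignment above is verifying.
 */
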
static int network_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
    unsigned short id;
    struct net_private *np = (struct net_private *)dev->priv;
    tx_req_entry_t *tx;
    netop_t netop;
    NET_RING_IDX i;

    if ( unlikely(np->tx_full) )
    {
        printk(KERN_ALERT "%s: full queue wasn't stopped!\n", dev->name);
        netif_stop_queue(dev);
        return -ENOBUFS;
    }

    /* Packets that straddle a page boundary must be copied to a new buffer. */
    if ( unlikely((((unsigned long)skb->data & ~PAGE_MASK) + skb->len) >=
                  PAGE_SIZE) )
    {
        struct sk_buff *new_skb = dev_alloc_skb(RX_BUF_SIZE);
        if ( unlikely(new_skb == NULL) )
            return 1;
        skb_put(new_skb, skb->len);
        memcpy(new_skb->data, skb->data, skb->len);
        dev_kfree_skb(skb);
        skb = new_skb;
    }

    spin_lock_irq(&np->tx_lock);

    i = np->net_idx->tx_req_prod;

    id = GET_ID_FROM_FREELIST(np->tx_skbs);
    np->tx_skbs[id] = skb;

    tx = &np->net_ring->tx_ring[MASK_NET_TX_IDX(i)].req;

    tx->id   = id;
    tx->addr = phys_to_machine(virt_to_phys(skb->data));
    tx->size = skb->len;

    /*
     * Record stats before the request is published: once Xen can respond,
     * network_tx_buf_gc() below may free the skb under us.
     */
    np->stats.tx_bytes += skb->len;
    np->stats.tx_packets++;

    wmb();
    np->net_idx->tx_req_prod = i + 1;

    network_tx_buf_gc(dev);

    if ( (i - np->tx_resp_cons) == (TX_RING_SIZE - 1) )
    {
        np->tx_full = 1;
        netif_stop_queue(dev);
    }

    spin_unlock_irq(&np->tx_lock);

    /* Only notify Xen if there are no outstanding responses. */
    mb();
    if ( np->net_idx->tx_resp_prod == i )
    {
        netop.cmd = NETOP_PUSH_BUFFERS;
        netop.vif = np->idx;
        (void)HYPERVISOR_net_io_op(&netop);
    }

    return 0;
}


static inline void _network_interrupt(struct net_device *dev)
{
    struct net_private *np = dev->priv;
    unsigned long flags;
    struct sk_buff *skb;
    rx_resp_entry_t *rx;
    NET_RING_IDX i;

    if ( unlikely(np->state == STATE_CLOSED) )
        return;

    spin_lock_irqsave(&np->tx_lock, flags);
    network_tx_buf_gc(dev);
    spin_unlock_irqrestore(&np->tx_lock, flags);

 again:
    for ( i = np->rx_resp_cons; i != np->net_idx->rx_resp_prod; i++ )
    {
        rx = &np->net_ring->rx_ring[MASK_NET_RX_IDX(i)].resp;

        skb = np->rx_skbs[rx->id];
        ADD_ID_TO_FREELIST(np->rx_skbs, rx->id);

        if ( unlikely(rx->status != RING_STATUS_OK) )
        {
            /* Gate this error. We get a (valid) slew of them on suspend. */
            if ( np->state == STATE_ACTIVE )
                printk(KERN_ALERT "bad buffer on RX ring!(%d)\n", rx->status);
            dev_kfree_skb_any(skb);
            continue;
        }

        /*
         * Set up shinfo -- from alloc_skb. This was particularly nasty: the
         * shared info is hidden at the back of the data area (presumably so
         * it can be shared), but the page flip clobbers it.
         */
        atomic_set(&(skb_shinfo(skb)->dataref), 1);
        skb_shinfo(skb)->nr_frags = 0;
        skb_shinfo(skb)->frag_list = NULL;

        /* Record which machine frame now backs this buffer's page. */
        phys_to_machine_mapping[virt_to_phys(skb->head) >> PAGE_SHIFT] =
            (*(unsigned long *)get_ppte(skb->head)) >> PAGE_SHIFT;

        skb->data = skb->tail = skb->head + rx->offset;
        skb_put(skb, rx->size);
        skb->protocol = eth_type_trans(skb, dev);

        np->stats.rx_packets++;
        np->stats.rx_bytes += rx->size;

        netif_rx(skb);
        dev->last_rx = jiffies;
    }

    np->rx_resp_cons = i;

    network_alloc_rx_buffers(dev);

    /* Deal with hypervisor racing our resetting of rx_event. */
    mb();
    if ( np->net_idx->rx_resp_prod != i )
        goto again;
}
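
/*
 * Illustrative note (sketch): the `goto again' above closes a race with the
 * hypervisor. network_alloc_rx_buffers() reset rx_event to rx_resp_cons + 1,
 * but Xen may have produced further responses after we sampled rx_resp_prod
 * and before the new event threshold became visible; those responses would
 * raise no fresh interrupt. The mb() orders the event write against the
 * re-read of rx_resp_prod, so any response that slipped in is consumed by
 * looping rather than lost.
 */
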
static void network_interrupt(int irq, void *unused, struct pt_regs *ptregs)
{
    struct list_head *ent;
    struct net_private *np;
    list_for_each ( ent, &dev_list )
    {
        np = list_entry(ent, struct net_private, list);
        _network_interrupt(np->dev);
    }
}


int network_close(struct net_device *dev)
{
    struct net_private *np = dev->priv;
    netop_t netop;

    np->state = STATE_SUSPENDED;
    wmb();

    netif_stop_queue(np->dev);

    netop.cmd = NETOP_FLUSH_BUFFERS;
    netop.vif = np->idx;
    (void)HYPERVISOR_net_io_op(&netop);

    /* Wait for all outstanding requests to be responded to. */
    while ( (np->rx_resp_cons != np->net_idx->rx_req_prod) ||
            (np->tx_resp_cons != np->net_idx->tx_req_prod) )
    {
        barrier();
        current->state = TASK_INTERRUPTIBLE;
        schedule_timeout(1);
    }

    wmb();
    np->state = STATE_CLOSED;
    wmb();

    /* Now no longer safe to take interrupts for this device. */
    clear_fixmap(FIX_NETRING0_BASE + np->net_ring_fixmap_idx);

    MOD_DEC_USE_COUNT;

    return 0;
}


static struct net_device_stats *network_get_stats(struct net_device *dev)
{
    struct net_private *np = (struct net_private *)dev->priv;
    return &np->stats;
}


/*
 * This notifier is installed for domain 0 only.
 * All other domains have VFR rules installed on their behalf by domain 0
 * when they are created. For bootstrap, Xen creates wildcard rules for
 * domain 0 -- this notifier is used to detect when we find our proper
 * IP address, so we can poke down proper rules and remove the wildcards.
 */
static int inetdev_notify(struct notifier_block *this,
                          unsigned long event,
                          void *ptr)
{
    struct in_ifaddr *ifa = (struct in_ifaddr *)ptr;
    struct net_device *dev = ifa->ifa_dev->dev;
    struct list_head *ent;
    struct net_private *np;
    int idx = -1;
    network_op_t op;

    list_for_each ( ent, &dev_list )
    {
        np = list_entry(ent, struct net_private, list);
        if ( np->dev == dev )
            idx = np->idx;
    }

    if ( idx == -1 )
        goto out;

    memset(&op, 0, sizeof(op));
    op.u.net_rule.proto  = NETWORK_PROTO_ANY;
    op.u.net_rule.action = NETWORK_ACTION_ACCEPT;

    if ( event == NETDEV_UP )
        op.cmd = NETWORK_OP_ADDRULE;
    else if ( event == NETDEV_DOWN )
        op.cmd = NETWORK_OP_DELETERULE;
    else
        goto out;

    /* Outbound rule: accept traffic from this VIF's address to anywhere. */
    op.u.net_rule.src_vif       = idx;
    op.u.net_rule.dst_vif       = VIF_PHYSICAL_INTERFACE;
    op.u.net_rule.src_addr      = ntohl(ifa->ifa_address);
    op.u.net_rule.src_addr_mask = ~0UL;
    op.u.net_rule.dst_addr      = 0;
    op.u.net_rule.dst_addr_mask = 0;
    (void)HYPERVISOR_network_op(&op);

    /* Inbound rule: accept traffic from anywhere to this VIF's address. */
    op.u.net_rule.src_vif       = VIF_ANY_INTERFACE;
    op.u.net_rule.dst_vif       = idx;
    op.u.net_rule.src_addr      = 0;
    op.u.net_rule.src_addr_mask = 0;
    op.u.net_rule.dst_addr      = ntohl(ifa->ifa_address);
    op.u.net_rule.dst_addr_mask = ~0UL;
    (void)HYPERVISOR_network_op(&op);

 out:
    return NOTIFY_DONE;
}

static struct notifier_block notifier_inetdev = {
    .notifier_call = inetdev_notify,
    .next          = NULL,
    .priority      = 0
};


int __init init_module(void)
{
    int i, fixmap_idx=-1, err;
    struct net_device *dev;
    struct net_private *np;
    netop_t netop;

    INIT_LIST_HEAD(&dev_list);

    /*
     * Domain 0 must poke its own network rules as it discovers its IP
     * addresses. All other domains have a privileged "parent" to do this
     * for them at start of day.
     */
    if ( start_info.dom_id == 0 )
        (void)register_inetaddr_notifier(&notifier_inetdev);

    err = request_irq(NET_IRQ, network_interrupt,
                      SA_SAMPLE_RANDOM, "network", NULL);
    if ( err )
    {
        printk(KERN_WARNING "Could not allocate network interrupt\n");
        goto fail;
    }

    err = request_irq(_EVENT_DEBUG, dbg_network_int, 0, "debug", NULL);
    if ( err )
        printk(KERN_WARNING "Non-fatal error -- no debug interrupt\n");

    for ( i = 0; i < MAX_DOMAIN_VIFS; i++ )
    {
        /* If the VIF is invalid then the query hypercall will fail. */
        netop.cmd = NETOP_GET_VIF_INFO;
        netop.vif = i;
        if ( HYPERVISOR_net_io_op(&netop) != 0 )
            continue;

        /* We actually only support up to 4 vifs right now. */
        if ( ++fixmap_idx == 4 )
            break;

        dev = alloc_etherdev(sizeof(struct net_private));
        if ( dev == NULL )
        {
            err = -ENOMEM;
            goto fail;
        }

        np = dev->priv;
        np->state               = STATE_CLOSED;
        np->net_ring_fixmap_idx = fixmap_idx;
        np->idx                 = i;

        SET_MODULE_OWNER(dev);
        dev->open            = network_open;
        dev->hard_start_xmit = network_start_xmit;
        dev->stop            = network_close;
        dev->get_stats       = network_get_stats;

        memcpy(dev->dev_addr, netop.u.get_vif_info.vmac, ETH_ALEN);

        if ( (err = register_netdev(dev)) != 0 )
        {
            kfree(dev);
            goto fail;
        }

        np->dev = dev;
        list_add(&np->list, &dev_list);
    }

    return 0;

 fail:
    cleanup_module();
    return err;
}
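
/*
 * Illustrative note (sketch): init_module()'s `fail' path unwinds by calling
 * cleanup_module() below, so teardown must cope with partially completed
 * setup. It therefore walks whatever devices made it onto dev_list and
 * unregisters the inetaddr notifier only in domain 0, mirroring the
 * conditional registration at start of day.
 */
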
static void cleanup_module(void)
{
    struct net_private *np;
    struct net_device *dev;

    while ( !list_empty(&dev_list) )
    {
        np = list_entry(dev_list.next, struct net_private, list);
        list_del(&np->list);
        dev = np->dev;
        unregister_netdev(dev);
        kfree(dev);
    }

    if ( start_info.dom_id == 0 )
        (void)unregister_inetaddr_notifier(&notifier_inetdev);
}


module_init(init_module);
module_exit(cleanup_module);
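
/*
 * Usage sketch (assumptions: a XenoLinux guest with at least one VIF, and
 * interface names following alloc_etherdev()'s default eth%d template):
 *
 *     insmod network.o     -- probes VIFs, registers eth0..ethN (N <= 3)
 *     ifconfig eth0 up     -- network_open(): maps the shared ring and
 *                             posts an initial batch of RX buffers
 *     ifconfig eth0 down   -- network_close(): flushes buffers and unmaps
 *     rmmod network        -- cleanup_module(): unregisters each device
 */
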